{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "\n",
    "# Target directory for the exported files; the tokenizer is saved under this\n",
    "# same directory name in a later cell. Created up front if it is missing.\n",
    "out = \"electra-base-discriminator-bahasa-cased\"\n",
    "os.makedirs(out, exist_ok=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# !pip3 install transformers -U"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "from transformers import ElectraTokenizer, ElectraModel, ElectraConfig, AutoTokenizer, AutoModelWithLMHead, pipeline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "('electra-base-discriminator-bahasa-cased/vocab.txt',\n",
       " 'electra-base-discriminator-bahasa-cased/special_tokens_map.json',\n",
       " 'electra-base-discriminator-bahasa-cased/added_tokens.json')"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Build the tokenizer from the local vocab file without lower-casing, then\n",
    "# export it into the directory held by `out` (defined in the first cell) rather\n",
    "# than repeating the directory name as a second literal.\n",
    "tokenizer = ElectraTokenizer('bahasa.wordpiece', do_lower_case=False)\n",
    "tokenizer.save_pretrained(out)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import logging\n",
    "\n",
    "import torch\n",
    "\n",
    "from transformers import ElectraConfig, ElectraForMaskedLM, ElectraForPreTraining, load_tf_weights_in_electra\n",
    "\n",
    "\n",
    "logging.basicConfig(level=logging.INFO)\n",
    "\n",
    "\n",
    "# NOTE(review): this cell has no execution count and the same function is\n",
    "# defined again, unchanged, in a later cell of this notebook.\n",
    "def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, config_file, pytorch_dump_path, discriminator_or_generator):\n",
    "    \"\"\"Convert a TensorFlow ELECTRA checkpoint into a saved PyTorch state dict.\n",
    "\n",
    "    Args:\n",
    "        tf_checkpoint_path: TensorFlow checkpoint to read the weights from.\n",
    "        config_file: JSON file loaded with ``ElectraConfig.from_json_file``.\n",
    "        pytorch_dump_path: Destination handed to ``torch.save`` for the state dict.\n",
    "        discriminator_or_generator: ``\"discriminator\"`` builds an\n",
    "            ``ElectraForPreTraining``; ``\"generator\"`` builds an ``ElectraForMaskedLM``.\n",
    "\n",
    "    Raises:\n",
    "        ValueError: If ``discriminator_or_generator`` is neither of those two strings.\n",
    "    \"\"\"\n",
    "    electra_config = ElectraConfig.from_json_file(config_file)\n",
    "    print(f\"Building PyTorch model from configuration: {electra_config!s}\")\n",
    "\n",
    "    # Pick the model class first, then instantiate it once.\n",
    "    if discriminator_or_generator == \"discriminator\":\n",
    "        model_cls = ElectraForPreTraining\n",
    "    elif discriminator_or_generator == \"generator\":\n",
    "        model_cls = ElectraForMaskedLM\n",
    "    else:\n",
    "        raise ValueError(\"The discriminator_or_generator argument should be either 'discriminator' or 'generator'\")\n",
    "    pt_model = model_cls(electra_config)\n",
    "\n",
    "    # Fill the freshly built PyTorch model with the weights from the TF checkpoint.\n",
    "    load_tf_weights_in_electra(\n",
    "        pt_model,\n",
    "        electra_config,\n",
    "        tf_checkpoint_path,\n",
    "        discriminator_or_generator=discriminator_or_generator,\n",
    "    )\n",
    "\n",
    "    # Only the state dict is written, not the module object itself.\n",
    "    print(f\"Save PyTorch model to {pytorch_dump_path}\")\n",
    "    torch.save(pt_model.state_dict(), pytorch_dump_path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "/snap/google-cloud-sdk/127/lib/third_party/requests/__init__.py:83: RequestsDependencyWarning: Old version of cryptography ([1, 2, 3]) may cause slowdown.\n",
      "  warnings.warn(warning, RequestsDependencyWarning)\n",
      "Copying gs://mesolitica-general/electra-base/model.ckpt-60000.data-00000-of-00001...\n",
      "Copying gs://mesolitica-general/electra-base/model.ckpt-60000.index...          \n",
      "Copying gs://mesolitica-general/electra-base/model.ckpt-60000.meta...           \n",
      "| [3 files][  1.4 GiB/  1.4 GiB]   60.0 MiB/s                                   \n",
      "Operation completed over 3 objects/1.4 GiB.                                      \n"
     ]
    }
   ],
   "source": [
    "# !mkdir out\n",
    "# !gsutil cp gs://mesolitica-general/electra-base/model.ckpt-60000* out"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "import logging\n",
    "\n",
    "import torch\n",
    "\n",
    "from transformers import ElectraConfig, ElectraForMaskedLM, ElectraForPreTraining, load_tf_weights_in_electra\n",
    "\n",
    "\n",
    "logging.basicConfig(level=logging.INFO)\n",
    "\n",
    "\n",
    "def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, config_file, pytorch_dump_path, discriminator_or_generator):\n",
    "    \"\"\"Convert a TensorFlow ELECTRA checkpoint into a saved PyTorch state dict.\n",
    "\n",
    "    Args:\n",
    "        tf_checkpoint_path: TensorFlow checkpoint to read the weights from.\n",
    "        config_file: JSON file loaded with ``ElectraConfig.from_json_file``.\n",
    "        pytorch_dump_path: Destination handed to ``torch.save`` for the state dict.\n",
    "        discriminator_or_generator: ``\"discriminator\"`` builds an\n",
    "            ``ElectraForPreTraining``; ``\"generator\"`` builds an ``ElectraForMaskedLM``.\n",
    "\n",
    "    Raises:\n",
    "        ValueError: If ``discriminator_or_generator`` is neither of those two strings.\n",
    "    \"\"\"\n",
    "    electra_config = ElectraConfig.from_json_file(config_file)\n",
    "    print(f\"Building PyTorch model from configuration: {electra_config!s}\")\n",
    "\n",
    "    # Pick the model class first, then instantiate it once.\n",
    "    if discriminator_or_generator == \"discriminator\":\n",
    "        model_cls = ElectraForPreTraining\n",
    "    elif discriminator_or_generator == \"generator\":\n",
    "        model_cls = ElectraForMaskedLM\n",
    "    else:\n",
    "        raise ValueError(\"The discriminator_or_generator argument should be either 'discriminator' or 'generator'\")\n",
    "    pt_model = model_cls(electra_config)\n",
    "\n",
    "    # Fill the freshly built PyTorch model with the weights from the TF checkpoint.\n",
    "    load_tf_weights_in_electra(\n",
    "        pt_model,\n",
    "        electra_config,\n",
    "        tf_checkpoint_path,\n",
    "        discriminator_or_generator=discriminator_or_generator,\n",
    "    )\n",
    "\n",
    "    # Only the state dict is written, not the module object itself.\n",
    "    print(f\"Save PyTorch model to {pytorch_dump_path}\")\n",
    "    torch.save(pt_model.state_dict(), pytorch_dump_path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Building PyTorch model from configuration: ElectraConfig {\n",
      "  \"_num_labels\": 2,\n",
      "  \"architectures\": null,\n",
      "  \"attention_probs_dropout_prob\": 0.1,\n",
      "  \"bad_words_ids\": null,\n",
      "  \"bos_token_id\": null,\n",
      "  \"decoder_start_token_id\": null,\n",
      "  \"do_sample\": false,\n",
      "  \"early_stopping\": false,\n",
      "  \"embedding_size\": 768,\n",
      "  \"eos_token_id\": null,\n",
      "  \"finetuning_task\": null,\n",
      "  \"hidden_act\": \"gelu\",\n",
      "  \"hidden_dropout_prob\": 0.1,\n",
      "  \"hidden_size\": 768,\n",
      "  \"id2label\": {\n",
      "    \"0\": \"LABEL_0\",\n",
      "    \"1\": \"LABEL_1\"\n",
      "  },\n",
      "  \"initializer_range\": 0.02,\n",
      "  \"intermediate_size\": 3072,\n",
      "  \"is_decoder\": false,\n",
      "  \"is_encoder_decoder\": false,\n",
      "  \"label2id\": {\n",
      "    \"LABEL_0\": 0,\n",
      "    \"LABEL_1\": 1\n",
      "  },\n",
      "  \"layer_norm_eps\": 1e-12,\n",
      "  \"length_penalty\": 1.0,\n",
      "  \"max_length\": 20,\n",
      "  \"max_position_embeddings\": 512,\n",
      "  \"min_length\": 0,\n",
      "  \"model_type\": \"electra\",\n",
      "  \"no_repeat_ngram_size\": 0,\n",
      "  \"num_attention_heads\": 12,\n",
      "  \"num_beams\": 1,\n",
      "  \"num_hidden_layers\": 12,\n",
      "  \"num_return_sequences\": 1,\n",
      "  \"output_attentions\": false,\n",
      "  \"output_hidden_states\": false,\n",
      "  \"output_past\": true,\n",
      "  \"pad_token_id\": 0,\n",
      "  \"prefix\": null,\n",
      "  \"pruned_heads\": {},\n",
      "  \"repetition_penalty\": 1.0,\n",
      "  \"task_specific_params\": null,\n",
      "  \"temperature\": 1.0,\n",
      "  \"top_k\": 50,\n",
      "  \"top_p\": 1.0,\n",
      "  \"torchscript\": false,\n",
      "  \"type_vocab_size\": 2,\n",
      "  \"use_bfloat16\": false,\n",
      "  \"vocab_size\": 32000\n",
      "}\n",
      "\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:transformers.modeling_electra:Converting TensorFlow checkpoint from /home/ubuntu/notebook/electra/out/model.ckpt-60000\n",
      "INFO:transformers.modeling_electra:Loading TF weight discriminator_predictions/dense/bias with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight discriminator_predictions/dense/bias/adam_m with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight discriminator_predictions/dense/bias/adam_v with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight discriminator_predictions/dense/kernel with shape [768, 768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight discriminator_predictions/dense/kernel/adam_m with shape [768, 768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight discriminator_predictions/dense/kernel/adam_v with shape [768, 768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight discriminator_predictions/dense_1/bias with shape [1]\n",
      "INFO:transformers.modeling_electra:Loading TF weight discriminator_predictions/dense_1/bias/adam_m with shape [1]\n",
      "INFO:transformers.modeling_electra:Loading TF weight discriminator_predictions/dense_1/bias/adam_v with shape [1]\n",
      "INFO:transformers.modeling_electra:Loading TF weight discriminator_predictions/dense_1/kernel with shape [768, 1]\n",
      "INFO:transformers.modeling_electra:Loading TF weight discriminator_predictions/dense_1/kernel/adam_m with shape [768, 1]\n",
      "INFO:transformers.modeling_electra:Loading TF weight discriminator_predictions/dense_1/kernel/adam_v with shape [768, 1]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/embeddings/LayerNorm/beta with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/embeddings/LayerNorm/beta/adam_m with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/embeddings/LayerNorm/beta/adam_v with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/embeddings/LayerNorm/gamma with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/embeddings/LayerNorm/gamma/adam_m with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/embeddings/LayerNorm/gamma/adam_v with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/embeddings/position_embeddings with shape [512, 768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/embeddings/position_embeddings/adam_m with shape [512, 768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/embeddings/position_embeddings/adam_v with shape [512, 768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/embeddings/token_type_embeddings with shape [2, 768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/embeddings/token_type_embeddings/adam_m with shape [2, 768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/embeddings/token_type_embeddings/adam_v with shape [2, 768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/embeddings/word_embeddings with shape [32000, 768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/embeddings/word_embeddings/adam_m with shape [32000, 768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/embeddings/word_embeddings/adam_v with shape [32000, 768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_0/attention/output/LayerNorm/beta with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_0/attention/output/LayerNorm/beta/adam_m with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_0/attention/output/LayerNorm/beta/adam_v with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_0/attention/output/LayerNorm/gamma with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_0/attention/output/LayerNorm/gamma/adam_m with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_0/attention/output/LayerNorm/gamma/adam_v with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_0/attention/output/dense/bias with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_0/attention/output/dense/bias/adam_m with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_0/attention/output/dense/bias/adam_v with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_0/attention/output/dense/kernel with shape [768, 768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_0/attention/output/dense/kernel/adam_m with shape [768, 768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_0/attention/output/dense/kernel/adam_v with shape [768, 768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_0/attention/self/key/bias with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_0/attention/self/key/bias/adam_m with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_0/attention/self/key/bias/adam_v with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_0/attention/self/key/kernel with shape [768, 768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_0/attention/self/key/kernel/adam_m with shape [768, 768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_0/attention/self/key/kernel/adam_v with shape [768, 768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_0/attention/self/query/bias with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_0/attention/self/query/bias/adam_m with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_0/attention/self/query/bias/adam_v with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_0/attention/self/query/kernel with shape [768, 768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_0/attention/self/query/kernel/adam_m with shape [768, 768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_0/attention/self/query/kernel/adam_v with shape [768, 768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_0/attention/self/value/bias with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_0/attention/self/value/bias/adam_m with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_0/attention/self/value/bias/adam_v with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_0/attention/self/value/kernel with shape [768, 768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_0/attention/self/value/kernel/adam_m with shape [768, 768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_0/attention/self/value/kernel/adam_v with shape [768, 768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_0/intermediate/dense/bias with shape [3072]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_0/intermediate/dense/bias/adam_m with shape [3072]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_0/intermediate/dense/bias/adam_v with shape [3072]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_0/intermediate/dense/kernel with shape [768, 3072]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_0/intermediate/dense/kernel/adam_m with shape [768, 3072]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_0/intermediate/dense/kernel/adam_v with shape [768, 3072]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_0/output/LayerNorm/beta with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_0/output/LayerNorm/beta/adam_m with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_0/output/LayerNorm/beta/adam_v with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_0/output/LayerNorm/gamma with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_0/output/LayerNorm/gamma/adam_m with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_0/output/LayerNorm/gamma/adam_v with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_0/output/dense/bias with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_0/output/dense/bias/adam_m with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_0/output/dense/bias/adam_v with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_0/output/dense/kernel with shape [3072, 768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_0/output/dense/kernel/adam_m with shape [3072, 768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_0/output/dense/kernel/adam_v with shape [3072, 768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_1/attention/output/LayerNorm/beta with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_1/attention/output/LayerNorm/beta/adam_m with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_1/attention/output/LayerNorm/beta/adam_v with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_1/attention/output/LayerNorm/gamma with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_1/attention/output/LayerNorm/gamma/adam_m with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_1/attention/output/LayerNorm/gamma/adam_v with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_1/attention/output/dense/bias with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_1/attention/output/dense/bias/adam_m with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_1/attention/output/dense/bias/adam_v with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_1/attention/output/dense/kernel with shape [768, 768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_1/attention/output/dense/kernel/adam_m with shape [768, 768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_1/attention/output/dense/kernel/adam_v with shape [768, 768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_1/attention/self/key/bias with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_1/attention/self/key/bias/adam_m with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_1/attention/self/key/bias/adam_v with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_1/attention/self/key/kernel with shape [768, 768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_1/attention/self/key/kernel/adam_m with shape [768, 768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_1/attention/self/key/kernel/adam_v with shape [768, 768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_1/attention/self/query/bias with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_1/attention/self/query/bias/adam_m with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_1/attention/self/query/bias/adam_v with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_1/attention/self/query/kernel with shape [768, 768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_1/attention/self/query/kernel/adam_m with shape [768, 768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_1/attention/self/query/kernel/adam_v with shape [768, 768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_1/attention/self/value/bias with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_1/attention/self/value/bias/adam_m with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_1/attention/self/value/bias/adam_v with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_1/attention/self/value/kernel with shape [768, 768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_1/attention/self/value/kernel/adam_m with shape [768, 768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_1/attention/self/value/kernel/adam_v with shape [768, 768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_1/intermediate/dense/bias with shape [3072]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_1/intermediate/dense/bias/adam_m with shape [3072]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_1/intermediate/dense/bias/adam_v with shape [3072]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_1/intermediate/dense/kernel with shape [768, 3072]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_1/intermediate/dense/kernel/adam_m with shape [768, 3072]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_1/intermediate/dense/kernel/adam_v with shape [768, 3072]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_1/output/LayerNorm/beta with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_1/output/LayerNorm/beta/adam_m with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_1/output/LayerNorm/beta/adam_v with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_1/output/LayerNorm/gamma with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_1/output/LayerNorm/gamma/adam_m with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_1/output/LayerNorm/gamma/adam_v with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_1/output/dense/bias with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_1/output/dense/bias/adam_m with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_1/output/dense/bias/adam_v with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_1/output/dense/kernel with shape [3072, 768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_1/output/dense/kernel/adam_m with shape [3072, 768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_1/output/dense/kernel/adam_v with shape [3072, 768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_10/attention/output/LayerNorm/beta with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_10/attention/output/LayerNorm/beta/adam_m with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_10/attention/output/LayerNorm/beta/adam_v with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_10/attention/output/LayerNorm/gamma with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_10/attention/output/LayerNorm/gamma/adam_m with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_10/attention/output/LayerNorm/gamma/adam_v with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_10/attention/output/dense/bias with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_10/attention/output/dense/bias/adam_m with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_10/attention/output/dense/bias/adam_v with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_10/attention/output/dense/kernel with shape [768, 768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_10/attention/output/dense/kernel/adam_m with shape [768, 768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_10/attention/output/dense/kernel/adam_v with shape [768, 768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_10/attention/self/key/bias with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_10/attention/self/key/bias/adam_m with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_10/attention/self/key/bias/adam_v with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_10/attention/self/key/kernel with shape [768, 768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_10/attention/self/key/kernel/adam_m with shape [768, 768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_10/attention/self/key/kernel/adam_v with shape [768, 768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_10/attention/self/query/bias with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_10/attention/self/query/bias/adam_m with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_10/attention/self/query/bias/adam_v with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_10/attention/self/query/kernel with shape [768, 768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_10/attention/self/query/kernel/adam_m with shape [768, 768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_10/attention/self/query/kernel/adam_v with shape [768, 768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_10/attention/self/value/bias with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_10/attention/self/value/bias/adam_m with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_10/attention/self/value/bias/adam_v with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_10/attention/self/value/kernel with shape [768, 768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_10/attention/self/value/kernel/adam_m with shape [768, 768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_10/attention/self/value/kernel/adam_v with shape [768, 768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_10/intermediate/dense/bias with shape [3072]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_10/intermediate/dense/bias/adam_m with shape [3072]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_10/intermediate/dense/bias/adam_v with shape [3072]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_10/intermediate/dense/kernel with shape [768, 3072]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_10/intermediate/dense/kernel/adam_m with shape [768, 3072]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_10/intermediate/dense/kernel/adam_v with shape [768, 3072]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_10/output/LayerNorm/beta with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_10/output/LayerNorm/beta/adam_m with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_10/output/LayerNorm/beta/adam_v with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_10/output/LayerNorm/gamma with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_10/output/LayerNorm/gamma/adam_m with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_10/output/LayerNorm/gamma/adam_v with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_10/output/dense/bias with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_10/output/dense/bias/adam_m with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_10/output/dense/bias/adam_v with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_10/output/dense/kernel with shape [3072, 768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_10/output/dense/kernel/adam_m with shape [3072, 768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_10/output/dense/kernel/adam_v with shape [3072, 768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_11/attention/output/LayerNorm/beta with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_11/attention/output/LayerNorm/beta/adam_m with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_11/attention/output/LayerNorm/beta/adam_v with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_11/attention/output/LayerNorm/gamma with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_11/attention/output/LayerNorm/gamma/adam_m with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_11/attention/output/LayerNorm/gamma/adam_v with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_11/attention/output/dense/bias with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_11/attention/output/dense/bias/adam_m with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_11/attention/output/dense/bias/adam_v with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_11/attention/output/dense/kernel with shape [768, 768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_11/attention/output/dense/kernel/adam_m with shape [768, 768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_11/attention/output/dense/kernel/adam_v with shape [768, 768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_11/attention/self/key/bias with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_11/attention/self/key/bias/adam_m with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_11/attention/self/key/bias/adam_v with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_11/attention/self/key/kernel with shape [768, 768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_11/attention/self/key/kernel/adam_m with shape [768, 768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_11/attention/self/key/kernel/adam_v with shape [768, 768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_11/attention/self/query/bias with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_11/attention/self/query/bias/adam_m with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_11/attention/self/query/bias/adam_v with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_11/attention/self/query/kernel with shape [768, 768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_11/attention/self/query/kernel/adam_m with shape [768, 768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_11/attention/self/query/kernel/adam_v with shape [768, 768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_11/attention/self/value/bias with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_11/attention/self/value/bias/adam_m with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_11/attention/self/value/bias/adam_v with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_11/attention/self/value/kernel with shape [768, 768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_11/attention/self/value/kernel/adam_m with shape [768, 768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_11/attention/self/value/kernel/adam_v with shape [768, 768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_11/intermediate/dense/bias with shape [3072]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_11/intermediate/dense/bias/adam_m with shape [3072]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_11/intermediate/dense/bias/adam_v with shape [3072]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_11/intermediate/dense/kernel with shape [768, 3072]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_11/intermediate/dense/kernel/adam_m with shape [768, 3072]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_11/intermediate/dense/kernel/adam_v with shape [768, 3072]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_11/output/LayerNorm/beta with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_11/output/LayerNorm/beta/adam_m with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_11/output/LayerNorm/beta/adam_v with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_11/output/LayerNorm/gamma with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_11/output/LayerNorm/gamma/adam_m with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_11/output/LayerNorm/gamma/adam_v with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_11/output/dense/bias with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_11/output/dense/bias/adam_m with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_11/output/dense/bias/adam_v with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_11/output/dense/kernel with shape [3072, 768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_11/output/dense/kernel/adam_m with shape [3072, 768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_11/output/dense/kernel/adam_v with shape [3072, 768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_2/attention/output/LayerNorm/beta with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_2/attention/output/LayerNorm/beta/adam_m with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_2/attention/output/LayerNorm/beta/adam_v with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_2/attention/output/LayerNorm/gamma with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_2/attention/output/LayerNorm/gamma/adam_m with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_2/attention/output/LayerNorm/gamma/adam_v with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_2/attention/output/dense/bias with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_2/attention/output/dense/bias/adam_m with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_2/attention/output/dense/bias/adam_v with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_2/attention/output/dense/kernel with shape [768, 768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_2/attention/output/dense/kernel/adam_m with shape [768, 768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_2/attention/output/dense/kernel/adam_v with shape [768, 768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_2/attention/self/key/bias with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_2/attention/self/key/bias/adam_m with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_2/attention/self/key/bias/adam_v with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_2/attention/self/key/kernel with shape [768, 768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_2/attention/self/key/kernel/adam_m with shape [768, 768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_2/attention/self/key/kernel/adam_v with shape [768, 768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_2/attention/self/query/bias with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_2/attention/self/query/bias/adam_m with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_2/attention/self/query/bias/adam_v with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_2/attention/self/query/kernel with shape [768, 768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_2/attention/self/query/kernel/adam_m with shape [768, 768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_2/attention/self/query/kernel/adam_v with shape [768, 768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_2/attention/self/value/bias with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_2/attention/self/value/bias/adam_m with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_2/attention/self/value/bias/adam_v with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_2/attention/self/value/kernel with shape [768, 768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_2/attention/self/value/kernel/adam_m with shape [768, 768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_2/attention/self/value/kernel/adam_v with shape [768, 768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_2/intermediate/dense/bias with shape [3072]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_2/intermediate/dense/bias/adam_m with shape [3072]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_2/intermediate/dense/bias/adam_v with shape [3072]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_2/intermediate/dense/kernel with shape [768, 3072]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_2/intermediate/dense/kernel/adam_m with shape [768, 3072]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_2/intermediate/dense/kernel/adam_v with shape [768, 3072]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_2/output/LayerNorm/beta with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_2/output/LayerNorm/beta/adam_m with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_2/output/LayerNorm/beta/adam_v with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_2/output/LayerNorm/gamma with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_2/output/LayerNorm/gamma/adam_m with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_2/output/LayerNorm/gamma/adam_v with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_2/output/dense/bias with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_2/output/dense/bias/adam_m with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_2/output/dense/bias/adam_v with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_2/output/dense/kernel with shape [3072, 768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_2/output/dense/kernel/adam_m with shape [3072, 768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_2/output/dense/kernel/adam_v with shape [3072, 768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_3/attention/output/LayerNorm/beta with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_3/attention/output/LayerNorm/beta/adam_m with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_3/attention/output/LayerNorm/beta/adam_v with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_3/attention/output/LayerNorm/gamma with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_3/attention/output/LayerNorm/gamma/adam_m with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_3/attention/output/LayerNorm/gamma/adam_v with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_3/attention/output/dense/bias with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_3/attention/output/dense/bias/adam_m with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_3/attention/output/dense/bias/adam_v with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_3/attention/output/dense/kernel with shape [768, 768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_3/attention/output/dense/kernel/adam_m with shape [768, 768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_3/attention/output/dense/kernel/adam_v with shape [768, 768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_3/attention/self/key/bias with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_3/attention/self/key/bias/adam_m with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_3/attention/self/key/bias/adam_v with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_3/attention/self/key/kernel with shape [768, 768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_3/attention/self/key/kernel/adam_m with shape [768, 768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_3/attention/self/key/kernel/adam_v with shape [768, 768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_3/attention/self/query/bias with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_3/attention/self/query/bias/adam_m with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_3/attention/self/query/bias/adam_v with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_3/attention/self/query/kernel with shape [768, 768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_3/attention/self/query/kernel/adam_m with shape [768, 768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_3/attention/self/query/kernel/adam_v with shape [768, 768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_3/attention/self/value/bias with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_3/attention/self/value/bias/adam_m with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_3/attention/self/value/bias/adam_v with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_3/attention/self/value/kernel with shape [768, 768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_3/attention/self/value/kernel/adam_m with shape [768, 768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_3/attention/self/value/kernel/adam_v with shape [768, 768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_3/intermediate/dense/bias with shape [3072]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_3/intermediate/dense/bias/adam_m with shape [3072]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_3/intermediate/dense/bias/adam_v with shape [3072]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_3/intermediate/dense/kernel with shape [768, 3072]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_3/intermediate/dense/kernel/adam_m with shape [768, 3072]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_3/intermediate/dense/kernel/adam_v with shape [768, 3072]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_3/output/LayerNorm/beta with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_3/output/LayerNorm/beta/adam_m with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_3/output/LayerNorm/beta/adam_v with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_3/output/LayerNorm/gamma with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_3/output/LayerNorm/gamma/adam_m with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_3/output/LayerNorm/gamma/adam_v with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_3/output/dense/bias with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_3/output/dense/bias/adam_m with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_3/output/dense/bias/adam_v with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_3/output/dense/kernel with shape [3072, 768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_3/output/dense/kernel/adam_m with shape [3072, 768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_3/output/dense/kernel/adam_v with shape [3072, 768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_4/attention/output/LayerNorm/beta with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_4/attention/output/LayerNorm/beta/adam_m with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_4/attention/output/LayerNorm/beta/adam_v with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_4/attention/output/LayerNorm/gamma with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_4/attention/output/LayerNorm/gamma/adam_m with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_4/attention/output/LayerNorm/gamma/adam_v with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_4/attention/output/dense/bias with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_4/attention/output/dense/bias/adam_m with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_4/attention/output/dense/bias/adam_v with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_4/attention/output/dense/kernel with shape [768, 768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_4/attention/output/dense/kernel/adam_m with shape [768, 768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_4/attention/output/dense/kernel/adam_v with shape [768, 768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_4/attention/self/key/bias with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_4/attention/self/key/bias/adam_m with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_4/attention/self/key/bias/adam_v with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_4/attention/self/key/kernel with shape [768, 768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_4/attention/self/key/kernel/adam_m with shape [768, 768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_4/attention/self/key/kernel/adam_v with shape [768, 768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_4/attention/self/query/bias with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_4/attention/self/query/bias/adam_m with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_4/attention/self/query/bias/adam_v with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_4/attention/self/query/kernel with shape [768, 768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_4/attention/self/query/kernel/adam_m with shape [768, 768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_4/attention/self/query/kernel/adam_v with shape [768, 768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_4/attention/self/value/bias with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_4/attention/self/value/bias/adam_m with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_4/attention/self/value/bias/adam_v with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_4/attention/self/value/kernel with shape [768, 768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_4/attention/self/value/kernel/adam_m with shape [768, 768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_4/attention/self/value/kernel/adam_v with shape [768, 768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_4/intermediate/dense/bias with shape [3072]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_4/intermediate/dense/bias/adam_m with shape [3072]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_4/intermediate/dense/bias/adam_v with shape [3072]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_4/intermediate/dense/kernel with shape [768, 3072]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_4/intermediate/dense/kernel/adam_m with shape [768, 3072]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_4/intermediate/dense/kernel/adam_v with shape [768, 3072]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_4/output/LayerNorm/beta with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_4/output/LayerNorm/beta/adam_m with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_4/output/LayerNorm/beta/adam_v with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_4/output/LayerNorm/gamma with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_4/output/LayerNorm/gamma/adam_m with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_4/output/LayerNorm/gamma/adam_v with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_4/output/dense/bias with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_4/output/dense/bias/adam_m with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_4/output/dense/bias/adam_v with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_4/output/dense/kernel with shape [3072, 768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_4/output/dense/kernel/adam_m with shape [3072, 768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_4/output/dense/kernel/adam_v with shape [3072, 768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_5/attention/output/LayerNorm/beta with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_5/attention/output/LayerNorm/beta/adam_m with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_5/attention/output/LayerNorm/beta/adam_v with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_5/attention/output/LayerNorm/gamma with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_5/attention/output/LayerNorm/gamma/adam_m with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_5/attention/output/LayerNorm/gamma/adam_v with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_5/attention/output/dense/bias with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_5/attention/output/dense/bias/adam_m with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_5/attention/output/dense/bias/adam_v with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_5/attention/output/dense/kernel with shape [768, 768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_5/attention/output/dense/kernel/adam_m with shape [768, 768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_5/attention/output/dense/kernel/adam_v with shape [768, 768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_5/attention/self/key/bias with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_5/attention/self/key/bias/adam_m with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_5/attention/self/key/bias/adam_v with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_5/attention/self/key/kernel with shape [768, 768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_5/attention/self/key/kernel/adam_m with shape [768, 768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_5/attention/self/key/kernel/adam_v with shape [768, 768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_5/attention/self/query/bias with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_5/attention/self/query/bias/adam_m with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_5/attention/self/query/bias/adam_v with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_5/attention/self/query/kernel with shape [768, 768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_5/attention/self/query/kernel/adam_m with shape [768, 768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_5/attention/self/query/kernel/adam_v with shape [768, 768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_5/attention/self/value/bias with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_5/attention/self/value/bias/adam_m with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_5/attention/self/value/bias/adam_v with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_5/attention/self/value/kernel with shape [768, 768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_5/attention/self/value/kernel/adam_m with shape [768, 768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_5/attention/self/value/kernel/adam_v with shape [768, 768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_5/intermediate/dense/bias with shape [3072]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_5/intermediate/dense/bias/adam_m with shape [3072]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_5/intermediate/dense/bias/adam_v with shape [3072]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_5/intermediate/dense/kernel with shape [768, 3072]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_5/intermediate/dense/kernel/adam_m with shape [768, 3072]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_5/intermediate/dense/kernel/adam_v with shape [768, 3072]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_5/output/LayerNorm/beta with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_5/output/LayerNorm/beta/adam_m with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_5/output/LayerNorm/beta/adam_v with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_5/output/LayerNorm/gamma with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_5/output/LayerNorm/gamma/adam_m with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_5/output/LayerNorm/gamma/adam_v with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_5/output/dense/bias with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_5/output/dense/bias/adam_m with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_5/output/dense/bias/adam_v with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_5/output/dense/kernel with shape [3072, 768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_5/output/dense/kernel/adam_m with shape [3072, 768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_5/output/dense/kernel/adam_v with shape [3072, 768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_6/attention/output/LayerNorm/beta with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_6/attention/output/LayerNorm/beta/adam_m with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_6/attention/output/LayerNorm/beta/adam_v with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_6/attention/output/LayerNorm/gamma with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_6/attention/output/LayerNorm/gamma/adam_m with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_6/attention/output/LayerNorm/gamma/adam_v with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_6/attention/output/dense/bias with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_6/attention/output/dense/bias/adam_m with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_6/attention/output/dense/bias/adam_v with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_6/attention/output/dense/kernel with shape [768, 768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_6/attention/output/dense/kernel/adam_m with shape [768, 768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_6/attention/output/dense/kernel/adam_v with shape [768, 768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_6/attention/self/key/bias with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_6/attention/self/key/bias/adam_m with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_6/attention/self/key/bias/adam_v with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_6/attention/self/key/kernel with shape [768, 768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_6/attention/self/key/kernel/adam_m with shape [768, 768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_6/attention/self/key/kernel/adam_v with shape [768, 768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_6/attention/self/query/bias with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_6/attention/self/query/bias/adam_m with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_6/attention/self/query/bias/adam_v with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_6/attention/self/query/kernel with shape [768, 768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_6/attention/self/query/kernel/adam_m with shape [768, 768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_6/attention/self/query/kernel/adam_v with shape [768, 768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_6/attention/self/value/bias with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_6/attention/self/value/bias/adam_m with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_6/attention/self/value/bias/adam_v with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_6/attention/self/value/kernel with shape [768, 768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_6/attention/self/value/kernel/adam_m with shape [768, 768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_6/attention/self/value/kernel/adam_v with shape [768, 768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_6/intermediate/dense/bias with shape [3072]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_6/intermediate/dense/bias/adam_m with shape [3072]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_6/intermediate/dense/bias/adam_v with shape [3072]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_6/intermediate/dense/kernel with shape [768, 3072]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_6/intermediate/dense/kernel/adam_m with shape [768, 3072]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_6/intermediate/dense/kernel/adam_v with shape [768, 3072]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_6/output/LayerNorm/beta with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_6/output/LayerNorm/beta/adam_m with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_6/output/LayerNorm/beta/adam_v with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_6/output/LayerNorm/gamma with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_6/output/LayerNorm/gamma/adam_m with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_6/output/LayerNorm/gamma/adam_v with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_6/output/dense/bias with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_6/output/dense/bias/adam_m with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_6/output/dense/bias/adam_v with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_6/output/dense/kernel with shape [3072, 768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_6/output/dense/kernel/adam_m with shape [3072, 768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_6/output/dense/kernel/adam_v with shape [3072, 768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_7/attention/output/LayerNorm/beta with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_7/attention/output/LayerNorm/beta/adam_m with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_7/attention/output/LayerNorm/beta/adam_v with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_7/attention/output/LayerNorm/gamma with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_7/attention/output/LayerNorm/gamma/adam_m with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_7/attention/output/LayerNorm/gamma/adam_v with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_7/attention/output/dense/bias with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_7/attention/output/dense/bias/adam_m with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_7/attention/output/dense/bias/adam_v with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_7/attention/output/dense/kernel with shape [768, 768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_7/attention/output/dense/kernel/adam_m with shape [768, 768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_7/attention/output/dense/kernel/adam_v with shape [768, 768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_7/attention/self/key/bias with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_7/attention/self/key/bias/adam_m with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_7/attention/self/key/bias/adam_v with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_7/attention/self/key/kernel with shape [768, 768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_7/attention/self/key/kernel/adam_m with shape [768, 768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_7/attention/self/key/kernel/adam_v with shape [768, 768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_7/attention/self/query/bias with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_7/attention/self/query/bias/adam_m with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_7/attention/self/query/bias/adam_v with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_7/attention/self/query/kernel with shape [768, 768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_7/attention/self/query/kernel/adam_m with shape [768, 768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_7/attention/self/query/kernel/adam_v with shape [768, 768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_7/attention/self/value/bias with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_7/attention/self/value/bias/adam_m with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_7/attention/self/value/bias/adam_v with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_7/attention/self/value/kernel with shape [768, 768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_7/attention/self/value/kernel/adam_m with shape [768, 768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_7/attention/self/value/kernel/adam_v with shape [768, 768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_7/intermediate/dense/bias with shape [3072]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_7/intermediate/dense/bias/adam_m with shape [3072]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_7/intermediate/dense/bias/adam_v with shape [3072]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_7/intermediate/dense/kernel with shape [768, 3072]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_7/intermediate/dense/kernel/adam_m with shape [768, 3072]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_7/intermediate/dense/kernel/adam_v with shape [768, 3072]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_7/output/LayerNorm/beta with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_7/output/LayerNorm/beta/adam_m with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_7/output/LayerNorm/beta/adam_v with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_7/output/LayerNorm/gamma with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_7/output/LayerNorm/gamma/adam_m with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_7/output/LayerNorm/gamma/adam_v with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_7/output/dense/bias with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_7/output/dense/bias/adam_m with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_7/output/dense/bias/adam_v with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_7/output/dense/kernel with shape [3072, 768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_7/output/dense/kernel/adam_m with shape [3072, 768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_7/output/dense/kernel/adam_v with shape [3072, 768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_8/attention/output/LayerNorm/beta with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_8/attention/output/LayerNorm/beta/adam_m with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_8/attention/output/LayerNorm/beta/adam_v with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_8/attention/output/LayerNorm/gamma with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_8/attention/output/LayerNorm/gamma/adam_m with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_8/attention/output/LayerNorm/gamma/adam_v with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_8/attention/output/dense/bias with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_8/attention/output/dense/bias/adam_m with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_8/attention/output/dense/bias/adam_v with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_8/attention/output/dense/kernel with shape [768, 768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_8/attention/output/dense/kernel/adam_m with shape [768, 768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_8/attention/output/dense/kernel/adam_v with shape [768, 768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_8/attention/self/key/bias with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_8/attention/self/key/bias/adam_m with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_8/attention/self/key/bias/adam_v with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_8/attention/self/key/kernel with shape [768, 768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_8/attention/self/key/kernel/adam_m with shape [768, 768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_8/attention/self/key/kernel/adam_v with shape [768, 768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_8/attention/self/query/bias with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_8/attention/self/query/bias/adam_m with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_8/attention/self/query/bias/adam_v with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_8/attention/self/query/kernel with shape [768, 768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_8/attention/self/query/kernel/adam_m with shape [768, 768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_8/attention/self/query/kernel/adam_v with shape [768, 768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_8/attention/self/value/bias with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_8/attention/self/value/bias/adam_m with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_8/attention/self/value/bias/adam_v with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_8/attention/self/value/kernel with shape [768, 768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_8/attention/self/value/kernel/adam_m with shape [768, 768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_8/attention/self/value/kernel/adam_v with shape [768, 768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_8/intermediate/dense/bias with shape [3072]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_8/intermediate/dense/bias/adam_m with shape [3072]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_8/intermediate/dense/bias/adam_v with shape [3072]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_8/intermediate/dense/kernel with shape [768, 3072]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_8/intermediate/dense/kernel/adam_m with shape [768, 3072]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_8/intermediate/dense/kernel/adam_v with shape [768, 3072]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_8/output/LayerNorm/beta with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_8/output/LayerNorm/beta/adam_m with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_8/output/LayerNorm/beta/adam_v with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_8/output/LayerNorm/gamma with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_8/output/LayerNorm/gamma/adam_m with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_8/output/LayerNorm/gamma/adam_v with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_8/output/dense/bias with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_8/output/dense/bias/adam_m with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_8/output/dense/bias/adam_v with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_8/output/dense/kernel with shape [3072, 768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_8/output/dense/kernel/adam_m with shape [3072, 768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_8/output/dense/kernel/adam_v with shape [3072, 768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_9/attention/output/LayerNorm/beta with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_9/attention/output/LayerNorm/beta/adam_m with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_9/attention/output/LayerNorm/beta/adam_v with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_9/attention/output/LayerNorm/gamma with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_9/attention/output/LayerNorm/gamma/adam_m with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_9/attention/output/LayerNorm/gamma/adam_v with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_9/attention/output/dense/bias with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_9/attention/output/dense/bias/adam_m with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_9/attention/output/dense/bias/adam_v with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_9/attention/output/dense/kernel with shape [768, 768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_9/attention/output/dense/kernel/adam_m with shape [768, 768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_9/attention/output/dense/kernel/adam_v with shape [768, 768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_9/attention/self/key/bias with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_9/attention/self/key/bias/adam_m with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_9/attention/self/key/bias/adam_v with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_9/attention/self/key/kernel with shape [768, 768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_9/attention/self/key/kernel/adam_m with shape [768, 768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_9/attention/self/key/kernel/adam_v with shape [768, 768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_9/attention/self/query/bias with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_9/attention/self/query/bias/adam_m with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_9/attention/self/query/bias/adam_v with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_9/attention/self/query/kernel with shape [768, 768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_9/attention/self/query/kernel/adam_m with shape [768, 768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_9/attention/self/query/kernel/adam_v with shape [768, 768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_9/attention/self/value/bias with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_9/attention/self/value/bias/adam_m with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_9/attention/self/value/bias/adam_v with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_9/attention/self/value/kernel with shape [768, 768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_9/attention/self/value/kernel/adam_m with shape [768, 768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_9/attention/self/value/kernel/adam_v with shape [768, 768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_9/intermediate/dense/bias with shape [3072]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_9/intermediate/dense/bias/adam_m with shape [3072]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_9/intermediate/dense/bias/adam_v with shape [3072]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_9/intermediate/dense/kernel with shape [768, 3072]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_9/intermediate/dense/kernel/adam_m with shape [768, 3072]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_9/intermediate/dense/kernel/adam_v with shape [768, 3072]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_9/output/LayerNorm/beta with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_9/output/LayerNorm/beta/adam_m with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_9/output/LayerNorm/beta/adam_v with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_9/output/LayerNorm/gamma with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_9/output/LayerNorm/gamma/adam_m with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_9/output/LayerNorm/gamma/adam_v with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_9/output/dense/bias with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_9/output/dense/bias/adam_m with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_9/output/dense/bias/adam_v with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_9/output/dense/kernel with shape [3072, 768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_9/output/dense/kernel/adam_m with shape [3072, 768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_9/output/dense/kernel/adam_v with shape [3072, 768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/embeddings_project/bias with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/embeddings_project/bias/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/embeddings_project/bias/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/embeddings_project/kernel with shape [768, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/embeddings_project/kernel/adam_m with shape [768, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/embeddings_project/kernel/adam_v with shape [768, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_0/attention/output/LayerNorm/beta with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_0/attention/output/LayerNorm/beta/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_0/attention/output/LayerNorm/beta/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_0/attention/output/LayerNorm/gamma with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_0/attention/output/LayerNorm/gamma/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_0/attention/output/LayerNorm/gamma/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_0/attention/output/dense/bias with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_0/attention/output/dense/bias/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_0/attention/output/dense/bias/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_0/attention/output/dense/kernel with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_0/attention/output/dense/kernel/adam_m with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_0/attention/output/dense/kernel/adam_v with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_0/attention/self/key/bias with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_0/attention/self/key/bias/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_0/attention/self/key/bias/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_0/attention/self/key/kernel with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_0/attention/self/key/kernel/adam_m with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_0/attention/self/key/kernel/adam_v with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_0/attention/self/query/bias with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_0/attention/self/query/bias/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_0/attention/self/query/bias/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_0/attention/self/query/kernel with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_0/attention/self/query/kernel/adam_m with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_0/attention/self/query/kernel/adam_v with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_0/attention/self/value/bias with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_0/attention/self/value/bias/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_0/attention/self/value/bias/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_0/attention/self/value/kernel with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_0/attention/self/value/kernel/adam_m with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_0/attention/self/value/kernel/adam_v with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_0/intermediate/dense/bias with shape [1024]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_0/intermediate/dense/bias/adam_m with shape [1024]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_0/intermediate/dense/bias/adam_v with shape [1024]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_0/intermediate/dense/kernel with shape [256, 1024]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_0/intermediate/dense/kernel/adam_m with shape [256, 1024]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_0/intermediate/dense/kernel/adam_v with shape [256, 1024]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_0/output/LayerNorm/beta with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_0/output/LayerNorm/beta/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_0/output/LayerNorm/beta/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_0/output/LayerNorm/gamma with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_0/output/LayerNorm/gamma/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_0/output/LayerNorm/gamma/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_0/output/dense/bias with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_0/output/dense/bias/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_0/output/dense/bias/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_0/output/dense/kernel with shape [1024, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_0/output/dense/kernel/adam_m with shape [1024, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_0/output/dense/kernel/adam_v with shape [1024, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_1/attention/output/LayerNorm/beta with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_1/attention/output/LayerNorm/beta/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_1/attention/output/LayerNorm/beta/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_1/attention/output/LayerNorm/gamma with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_1/attention/output/LayerNorm/gamma/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_1/attention/output/LayerNorm/gamma/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_1/attention/output/dense/bias with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_1/attention/output/dense/bias/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_1/attention/output/dense/bias/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_1/attention/output/dense/kernel with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_1/attention/output/dense/kernel/adam_m with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_1/attention/output/dense/kernel/adam_v with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_1/attention/self/key/bias with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_1/attention/self/key/bias/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_1/attention/self/key/bias/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_1/attention/self/key/kernel with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_1/attention/self/key/kernel/adam_m with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_1/attention/self/key/kernel/adam_v with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_1/attention/self/query/bias with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_1/attention/self/query/bias/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_1/attention/self/query/bias/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_1/attention/self/query/kernel with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_1/attention/self/query/kernel/adam_m with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_1/attention/self/query/kernel/adam_v with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_1/attention/self/value/bias with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_1/attention/self/value/bias/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_1/attention/self/value/bias/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_1/attention/self/value/kernel with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_1/attention/self/value/kernel/adam_m with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_1/attention/self/value/kernel/adam_v with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_1/intermediate/dense/bias with shape [1024]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_1/intermediate/dense/bias/adam_m with shape [1024]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_1/intermediate/dense/bias/adam_v with shape [1024]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_1/intermediate/dense/kernel with shape [256, 1024]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_1/intermediate/dense/kernel/adam_m with shape [256, 1024]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_1/intermediate/dense/kernel/adam_v with shape [256, 1024]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_1/output/LayerNorm/beta with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_1/output/LayerNorm/beta/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_1/output/LayerNorm/beta/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_1/output/LayerNorm/gamma with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_1/output/LayerNorm/gamma/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_1/output/LayerNorm/gamma/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_1/output/dense/bias with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_1/output/dense/bias/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_1/output/dense/bias/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_1/output/dense/kernel with shape [1024, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_1/output/dense/kernel/adam_m with shape [1024, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_1/output/dense/kernel/adam_v with shape [1024, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_10/attention/output/LayerNorm/beta with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_10/attention/output/LayerNorm/beta/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_10/attention/output/LayerNorm/beta/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_10/attention/output/LayerNorm/gamma with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_10/attention/output/LayerNorm/gamma/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_10/attention/output/LayerNorm/gamma/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_10/attention/output/dense/bias with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_10/attention/output/dense/bias/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_10/attention/output/dense/bias/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_10/attention/output/dense/kernel with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_10/attention/output/dense/kernel/adam_m with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_10/attention/output/dense/kernel/adam_v with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_10/attention/self/key/bias with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_10/attention/self/key/bias/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_10/attention/self/key/bias/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_10/attention/self/key/kernel with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_10/attention/self/key/kernel/adam_m with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_10/attention/self/key/kernel/adam_v with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_10/attention/self/query/bias with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_10/attention/self/query/bias/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_10/attention/self/query/bias/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_10/attention/self/query/kernel with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_10/attention/self/query/kernel/adam_m with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_10/attention/self/query/kernel/adam_v with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_10/attention/self/value/bias with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_10/attention/self/value/bias/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_10/attention/self/value/bias/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_10/attention/self/value/kernel with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_10/attention/self/value/kernel/adam_m with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_10/attention/self/value/kernel/adam_v with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_10/intermediate/dense/bias with shape [1024]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_10/intermediate/dense/bias/adam_m with shape [1024]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_10/intermediate/dense/bias/adam_v with shape [1024]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_10/intermediate/dense/kernel with shape [256, 1024]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_10/intermediate/dense/kernel/adam_m with shape [256, 1024]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_10/intermediate/dense/kernel/adam_v with shape [256, 1024]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_10/output/LayerNorm/beta with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_10/output/LayerNorm/beta/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_10/output/LayerNorm/beta/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_10/output/LayerNorm/gamma with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_10/output/LayerNorm/gamma/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_10/output/LayerNorm/gamma/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_10/output/dense/bias with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_10/output/dense/bias/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_10/output/dense/bias/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_10/output/dense/kernel with shape [1024, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_10/output/dense/kernel/adam_m with shape [1024, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_10/output/dense/kernel/adam_v with shape [1024, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_11/attention/output/LayerNorm/beta with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_11/attention/output/LayerNorm/beta/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_11/attention/output/LayerNorm/beta/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_11/attention/output/LayerNorm/gamma with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_11/attention/output/LayerNorm/gamma/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_11/attention/output/LayerNorm/gamma/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_11/attention/output/dense/bias with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_11/attention/output/dense/bias/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_11/attention/output/dense/bias/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_11/attention/output/dense/kernel with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_11/attention/output/dense/kernel/adam_m with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_11/attention/output/dense/kernel/adam_v with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_11/attention/self/key/bias with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_11/attention/self/key/bias/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_11/attention/self/key/bias/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_11/attention/self/key/kernel with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_11/attention/self/key/kernel/adam_m with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_11/attention/self/key/kernel/adam_v with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_11/attention/self/query/bias with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_11/attention/self/query/bias/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_11/attention/self/query/bias/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_11/attention/self/query/kernel with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_11/attention/self/query/kernel/adam_m with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_11/attention/self/query/kernel/adam_v with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_11/attention/self/value/bias with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_11/attention/self/value/bias/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_11/attention/self/value/bias/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_11/attention/self/value/kernel with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_11/attention/self/value/kernel/adam_m with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_11/attention/self/value/kernel/adam_v with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_11/intermediate/dense/bias with shape [1024]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_11/intermediate/dense/bias/adam_m with shape [1024]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_11/intermediate/dense/bias/adam_v with shape [1024]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_11/intermediate/dense/kernel with shape [256, 1024]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_11/intermediate/dense/kernel/adam_m with shape [256, 1024]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_11/intermediate/dense/kernel/adam_v with shape [256, 1024]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_11/output/LayerNorm/beta with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_11/output/LayerNorm/beta/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_11/output/LayerNorm/beta/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_11/output/LayerNorm/gamma with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_11/output/LayerNorm/gamma/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_11/output/LayerNorm/gamma/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_11/output/dense/bias with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_11/output/dense/bias/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_11/output/dense/bias/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_11/output/dense/kernel with shape [1024, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_11/output/dense/kernel/adam_m with shape [1024, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_11/output/dense/kernel/adam_v with shape [1024, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_2/attention/output/LayerNorm/beta with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_2/attention/output/LayerNorm/beta/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_2/attention/output/LayerNorm/beta/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_2/attention/output/LayerNorm/gamma with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_2/attention/output/LayerNorm/gamma/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_2/attention/output/LayerNorm/gamma/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_2/attention/output/dense/bias with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_2/attention/output/dense/bias/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_2/attention/output/dense/bias/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_2/attention/output/dense/kernel with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_2/attention/output/dense/kernel/adam_m with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_2/attention/output/dense/kernel/adam_v with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_2/attention/self/key/bias with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_2/attention/self/key/bias/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_2/attention/self/key/bias/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_2/attention/self/key/kernel with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_2/attention/self/key/kernel/adam_m with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_2/attention/self/key/kernel/adam_v with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_2/attention/self/query/bias with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_2/attention/self/query/bias/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_2/attention/self/query/bias/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_2/attention/self/query/kernel with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_2/attention/self/query/kernel/adam_m with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_2/attention/self/query/kernel/adam_v with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_2/attention/self/value/bias with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_2/attention/self/value/bias/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_2/attention/self/value/bias/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_2/attention/self/value/kernel with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_2/attention/self/value/kernel/adam_m with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_2/attention/self/value/kernel/adam_v with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_2/intermediate/dense/bias with shape [1024]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_2/intermediate/dense/bias/adam_m with shape [1024]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_2/intermediate/dense/bias/adam_v with shape [1024]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_2/intermediate/dense/kernel with shape [256, 1024]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_2/intermediate/dense/kernel/adam_m with shape [256, 1024]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_2/intermediate/dense/kernel/adam_v with shape [256, 1024]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_2/output/LayerNorm/beta with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_2/output/LayerNorm/beta/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_2/output/LayerNorm/beta/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_2/output/LayerNorm/gamma with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_2/output/LayerNorm/gamma/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_2/output/LayerNorm/gamma/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_2/output/dense/bias with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_2/output/dense/bias/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_2/output/dense/bias/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_2/output/dense/kernel with shape [1024, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_2/output/dense/kernel/adam_m with shape [1024, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_2/output/dense/kernel/adam_v with shape [1024, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_3/attention/output/LayerNorm/beta with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_3/attention/output/LayerNorm/beta/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_3/attention/output/LayerNorm/beta/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_3/attention/output/LayerNorm/gamma with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_3/attention/output/LayerNorm/gamma/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_3/attention/output/LayerNorm/gamma/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_3/attention/output/dense/bias with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_3/attention/output/dense/bias/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_3/attention/output/dense/bias/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_3/attention/output/dense/kernel with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_3/attention/output/dense/kernel/adam_m with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_3/attention/output/dense/kernel/adam_v with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_3/attention/self/key/bias with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_3/attention/self/key/bias/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_3/attention/self/key/bias/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_3/attention/self/key/kernel with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_3/attention/self/key/kernel/adam_m with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_3/attention/self/key/kernel/adam_v with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_3/attention/self/query/bias with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_3/attention/self/query/bias/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_3/attention/self/query/bias/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_3/attention/self/query/kernel with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_3/attention/self/query/kernel/adam_m with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_3/attention/self/query/kernel/adam_v with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_3/attention/self/value/bias with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_3/attention/self/value/bias/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_3/attention/self/value/bias/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_3/attention/self/value/kernel with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_3/attention/self/value/kernel/adam_m with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_3/attention/self/value/kernel/adam_v with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_3/intermediate/dense/bias with shape [1024]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_3/intermediate/dense/bias/adam_m with shape [1024]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_3/intermediate/dense/bias/adam_v with shape [1024]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_3/intermediate/dense/kernel with shape [256, 1024]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_3/intermediate/dense/kernel/adam_m with shape [256, 1024]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_3/intermediate/dense/kernel/adam_v with shape [256, 1024]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_3/output/LayerNorm/beta with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_3/output/LayerNorm/beta/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_3/output/LayerNorm/beta/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_3/output/LayerNorm/gamma with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_3/output/LayerNorm/gamma/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_3/output/LayerNorm/gamma/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_3/output/dense/bias with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_3/output/dense/bias/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_3/output/dense/bias/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_3/output/dense/kernel with shape [1024, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_3/output/dense/kernel/adam_m with shape [1024, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_3/output/dense/kernel/adam_v with shape [1024, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_4/attention/output/LayerNorm/beta with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_4/attention/output/LayerNorm/beta/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_4/attention/output/LayerNorm/beta/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_4/attention/output/LayerNorm/gamma with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_4/attention/output/LayerNorm/gamma/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_4/attention/output/LayerNorm/gamma/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_4/attention/output/dense/bias with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_4/attention/output/dense/bias/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_4/attention/output/dense/bias/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_4/attention/output/dense/kernel with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_4/attention/output/dense/kernel/adam_m with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_4/attention/output/dense/kernel/adam_v with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_4/attention/self/key/bias with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_4/attention/self/key/bias/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_4/attention/self/key/bias/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_4/attention/self/key/kernel with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_4/attention/self/key/kernel/adam_m with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_4/attention/self/key/kernel/adam_v with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_4/attention/self/query/bias with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_4/attention/self/query/bias/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_4/attention/self/query/bias/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_4/attention/self/query/kernel with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_4/attention/self/query/kernel/adam_m with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_4/attention/self/query/kernel/adam_v with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_4/attention/self/value/bias with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_4/attention/self/value/bias/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_4/attention/self/value/bias/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_4/attention/self/value/kernel with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_4/attention/self/value/kernel/adam_m with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_4/attention/self/value/kernel/adam_v with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_4/intermediate/dense/bias with shape [1024]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_4/intermediate/dense/bias/adam_m with shape [1024]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_4/intermediate/dense/bias/adam_v with shape [1024]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_4/intermediate/dense/kernel with shape [256, 1024]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_4/intermediate/dense/kernel/adam_m with shape [256, 1024]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_4/intermediate/dense/kernel/adam_v with shape [256, 1024]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_4/output/LayerNorm/beta with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_4/output/LayerNorm/beta/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_4/output/LayerNorm/beta/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_4/output/LayerNorm/gamma with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_4/output/LayerNorm/gamma/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_4/output/LayerNorm/gamma/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_4/output/dense/bias with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_4/output/dense/bias/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_4/output/dense/bias/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_4/output/dense/kernel with shape [1024, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_4/output/dense/kernel/adam_m with shape [1024, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_4/output/dense/kernel/adam_v with shape [1024, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_5/attention/output/LayerNorm/beta with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_5/attention/output/LayerNorm/beta/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_5/attention/output/LayerNorm/beta/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_5/attention/output/LayerNorm/gamma with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_5/attention/output/LayerNorm/gamma/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_5/attention/output/LayerNorm/gamma/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_5/attention/output/dense/bias with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_5/attention/output/dense/bias/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_5/attention/output/dense/bias/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_5/attention/output/dense/kernel with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_5/attention/output/dense/kernel/adam_m with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_5/attention/output/dense/kernel/adam_v with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_5/attention/self/key/bias with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_5/attention/self/key/bias/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_5/attention/self/key/bias/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_5/attention/self/key/kernel with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_5/attention/self/key/kernel/adam_m with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_5/attention/self/key/kernel/adam_v with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_5/attention/self/query/bias with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_5/attention/self/query/bias/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_5/attention/self/query/bias/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_5/attention/self/query/kernel with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_5/attention/self/query/kernel/adam_m with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_5/attention/self/query/kernel/adam_v with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_5/attention/self/value/bias with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_5/attention/self/value/bias/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_5/attention/self/value/bias/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_5/attention/self/value/kernel with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_5/attention/self/value/kernel/adam_m with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_5/attention/self/value/kernel/adam_v with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_5/intermediate/dense/bias with shape [1024]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_5/intermediate/dense/bias/adam_m with shape [1024]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_5/intermediate/dense/bias/adam_v with shape [1024]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_5/intermediate/dense/kernel with shape [256, 1024]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_5/intermediate/dense/kernel/adam_m with shape [256, 1024]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_5/intermediate/dense/kernel/adam_v with shape [256, 1024]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_5/output/LayerNorm/beta with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_5/output/LayerNorm/beta/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_5/output/LayerNorm/beta/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_5/output/LayerNorm/gamma with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_5/output/LayerNorm/gamma/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_5/output/LayerNorm/gamma/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_5/output/dense/bias with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_5/output/dense/bias/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_5/output/dense/bias/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_5/output/dense/kernel with shape [1024, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_5/output/dense/kernel/adam_m with shape [1024, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_5/output/dense/kernel/adam_v with shape [1024, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_6/attention/output/LayerNorm/beta with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_6/attention/output/LayerNorm/beta/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_6/attention/output/LayerNorm/beta/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_6/attention/output/LayerNorm/gamma with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_6/attention/output/LayerNorm/gamma/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_6/attention/output/LayerNorm/gamma/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_6/attention/output/dense/bias with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_6/attention/output/dense/bias/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_6/attention/output/dense/bias/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_6/attention/output/dense/kernel with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_6/attention/output/dense/kernel/adam_m with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_6/attention/output/dense/kernel/adam_v with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_6/attention/self/key/bias with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_6/attention/self/key/bias/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_6/attention/self/key/bias/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_6/attention/self/key/kernel with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_6/attention/self/key/kernel/adam_m with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_6/attention/self/key/kernel/adam_v with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_6/attention/self/query/bias with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_6/attention/self/query/bias/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_6/attention/self/query/bias/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_6/attention/self/query/kernel with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_6/attention/self/query/kernel/adam_m with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_6/attention/self/query/kernel/adam_v with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_6/attention/self/value/bias with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_6/attention/self/value/bias/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_6/attention/self/value/bias/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_6/attention/self/value/kernel with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_6/attention/self/value/kernel/adam_m with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_6/attention/self/value/kernel/adam_v with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_6/intermediate/dense/bias with shape [1024]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_6/intermediate/dense/bias/adam_m with shape [1024]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_6/intermediate/dense/bias/adam_v with shape [1024]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_6/intermediate/dense/kernel with shape [256, 1024]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_6/intermediate/dense/kernel/adam_m with shape [256, 1024]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_6/intermediate/dense/kernel/adam_v with shape [256, 1024]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_6/output/LayerNorm/beta with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_6/output/LayerNorm/beta/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_6/output/LayerNorm/beta/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_6/output/LayerNorm/gamma with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_6/output/LayerNorm/gamma/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_6/output/LayerNorm/gamma/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_6/output/dense/bias with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_6/output/dense/bias/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_6/output/dense/bias/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_6/output/dense/kernel with shape [1024, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_6/output/dense/kernel/adam_m with shape [1024, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_6/output/dense/kernel/adam_v with shape [1024, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_7/attention/output/LayerNorm/beta with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_7/attention/output/LayerNorm/beta/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_7/attention/output/LayerNorm/beta/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_7/attention/output/LayerNorm/gamma with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_7/attention/output/LayerNorm/gamma/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_7/attention/output/LayerNorm/gamma/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_7/attention/output/dense/bias with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_7/attention/output/dense/bias/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_7/attention/output/dense/bias/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_7/attention/output/dense/kernel with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_7/attention/output/dense/kernel/adam_m with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_7/attention/output/dense/kernel/adam_v with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_7/attention/self/key/bias with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_7/attention/self/key/bias/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_7/attention/self/key/bias/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_7/attention/self/key/kernel with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_7/attention/self/key/kernel/adam_m with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_7/attention/self/key/kernel/adam_v with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_7/attention/self/query/bias with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_7/attention/self/query/bias/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_7/attention/self/query/bias/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_7/attention/self/query/kernel with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_7/attention/self/query/kernel/adam_m with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_7/attention/self/query/kernel/adam_v with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_7/attention/self/value/bias with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_7/attention/self/value/bias/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_7/attention/self/value/bias/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_7/attention/self/value/kernel with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_7/attention/self/value/kernel/adam_m with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_7/attention/self/value/kernel/adam_v with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_7/intermediate/dense/bias with shape [1024]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_7/intermediate/dense/bias/adam_m with shape [1024]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_7/intermediate/dense/bias/adam_v with shape [1024]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_7/intermediate/dense/kernel with shape [256, 1024]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_7/intermediate/dense/kernel/adam_m with shape [256, 1024]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_7/intermediate/dense/kernel/adam_v with shape [256, 1024]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_7/output/LayerNorm/beta with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_7/output/LayerNorm/beta/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_7/output/LayerNorm/beta/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_7/output/LayerNorm/gamma with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_7/output/LayerNorm/gamma/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_7/output/LayerNorm/gamma/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_7/output/dense/bias with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_7/output/dense/bias/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_7/output/dense/bias/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_7/output/dense/kernel with shape [1024, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_7/output/dense/kernel/adam_m with shape [1024, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_7/output/dense/kernel/adam_v with shape [1024, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_8/attention/output/LayerNorm/beta with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_8/attention/output/LayerNorm/beta/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_8/attention/output/LayerNorm/beta/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_8/attention/output/LayerNorm/gamma with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_8/attention/output/LayerNorm/gamma/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_8/attention/output/LayerNorm/gamma/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_8/attention/output/dense/bias with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_8/attention/output/dense/bias/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_8/attention/output/dense/bias/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_8/attention/output/dense/kernel with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_8/attention/output/dense/kernel/adam_m with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_8/attention/output/dense/kernel/adam_v with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_8/attention/self/key/bias with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_8/attention/self/key/bias/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_8/attention/self/key/bias/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_8/attention/self/key/kernel with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_8/attention/self/key/kernel/adam_m with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_8/attention/self/key/kernel/adam_v with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_8/attention/self/query/bias with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_8/attention/self/query/bias/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_8/attention/self/query/bias/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_8/attention/self/query/kernel with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_8/attention/self/query/kernel/adam_m with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_8/attention/self/query/kernel/adam_v with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_8/attention/self/value/bias with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_8/attention/self/value/bias/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_8/attention/self/value/bias/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_8/attention/self/value/kernel with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_8/attention/self/value/kernel/adam_m with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_8/attention/self/value/kernel/adam_v with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_8/intermediate/dense/bias with shape [1024]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_8/intermediate/dense/bias/adam_m with shape [1024]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_8/intermediate/dense/bias/adam_v with shape [1024]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_8/intermediate/dense/kernel with shape [256, 1024]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_8/intermediate/dense/kernel/adam_m with shape [256, 1024]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_8/intermediate/dense/kernel/adam_v with shape [256, 1024]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_8/output/LayerNorm/beta with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_8/output/LayerNorm/beta/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_8/output/LayerNorm/beta/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_8/output/LayerNorm/gamma with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_8/output/LayerNorm/gamma/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_8/output/LayerNorm/gamma/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_8/output/dense/bias with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_8/output/dense/bias/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_8/output/dense/bias/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_8/output/dense/kernel with shape [1024, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_8/output/dense/kernel/adam_m with shape [1024, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_8/output/dense/kernel/adam_v with shape [1024, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_9/attention/output/LayerNorm/beta with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_9/attention/output/LayerNorm/beta/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_9/attention/output/LayerNorm/beta/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_9/attention/output/LayerNorm/gamma with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_9/attention/output/LayerNorm/gamma/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_9/attention/output/LayerNorm/gamma/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_9/attention/output/dense/bias with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_9/attention/output/dense/bias/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_9/attention/output/dense/bias/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_9/attention/output/dense/kernel with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_9/attention/output/dense/kernel/adam_m with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_9/attention/output/dense/kernel/adam_v with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_9/attention/self/key/bias with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_9/attention/self/key/bias/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_9/attention/self/key/bias/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_9/attention/self/key/kernel with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_9/attention/self/key/kernel/adam_m with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_9/attention/self/key/kernel/adam_v with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_9/attention/self/query/bias with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_9/attention/self/query/bias/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_9/attention/self/query/bias/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_9/attention/self/query/kernel with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_9/attention/self/query/kernel/adam_m with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_9/attention/self/query/kernel/adam_v with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_9/attention/self/value/bias with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_9/attention/self/value/bias/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_9/attention/self/value/bias/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_9/attention/self/value/kernel with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_9/attention/self/value/kernel/adam_m with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_9/attention/self/value/kernel/adam_v with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_9/intermediate/dense/bias with shape [1024]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_9/intermediate/dense/bias/adam_m with shape [1024]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_9/intermediate/dense/bias/adam_v with shape [1024]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_9/intermediate/dense/kernel with shape [256, 1024]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_9/intermediate/dense/kernel/adam_m with shape [256, 1024]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_9/intermediate/dense/kernel/adam_v with shape [256, 1024]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_9/output/LayerNorm/beta with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_9/output/LayerNorm/beta/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_9/output/LayerNorm/beta/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_9/output/LayerNorm/gamma with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_9/output/LayerNorm/gamma/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_9/output/LayerNorm/gamma/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_9/output/dense/bias with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_9/output/dense/bias/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_9/output/dense/bias/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_9/output/dense/kernel with shape [1024, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_9/output/dense/kernel/adam_m with shape [1024, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_9/output/dense/kernel/adam_v with shape [1024, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator_predictions/LayerNorm/beta with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator_predictions/LayerNorm/beta/adam_m with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator_predictions/LayerNorm/beta/adam_v with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator_predictions/LayerNorm/gamma with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator_predictions/LayerNorm/gamma/adam_m with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator_predictions/LayerNorm/gamma/adam_v with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator_predictions/dense/bias with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator_predictions/dense/bias/adam_m with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator_predictions/dense/bias/adam_v with shape [768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator_predictions/dense/kernel with shape [256, 768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator_predictions/dense/kernel/adam_m with shape [256, 768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator_predictions/dense/kernel/adam_v with shape [256, 768]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator_predictions/output_bias with shape [32000]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator_predictions/output_bias/adam_m with shape [32000]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator_predictions/output_bias/adam_v with shape [32000]\n",
      "INFO:transformers.modeling_electra:Loading TF weight global_step with shape []\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Initialize PyTorch weight ['discriminator_predictions', 'dense', 'bias'] discriminator_predictions/dense/bias\n",
      "Skipping discriminator_predictions/dense/bias/adam_m ['discriminator_predictions', 'dense', 'bias', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping discriminator_predictions/dense/bias/adam_v ['discriminator_predictions', 'dense', 'bias', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['discriminator_predictions', 'dense', 'kernel'] discriminator_predictions/dense/kernel\n",
      "Skipping discriminator_predictions/dense/kernel/adam_m ['discriminator_predictions', 'dense', 'kernel', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping discriminator_predictions/dense/kernel/adam_v ['discriminator_predictions', 'dense', 'kernel', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['discriminator_predictions', 'dense_prediction', 'bias'] discriminator_predictions/dense_1/bias\n",
      "Skipping discriminator_predictions/dense_1/bias/adam_m ['discriminator_predictions', 'dense_prediction', 'bias', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping discriminator_predictions/dense_1/bias/adam_v ['discriminator_predictions', 'dense_prediction', 'bias', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['discriminator_predictions', 'dense_prediction', 'kernel'] discriminator_predictions/dense_1/kernel\n",
      "Skipping discriminator_predictions/dense_1/kernel/adam_m ['discriminator_predictions', 'dense_prediction', 'kernel', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping discriminator_predictions/dense_1/kernel/adam_v ['discriminator_predictions', 'dense_prediction', 'kernel', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'embeddings', 'LayerNorm', 'beta'] electra/embeddings/LayerNorm/beta\n",
      "Skipping electra/embeddings/LayerNorm/beta/adam_m ['electra', 'embeddings', 'LayerNorm', 'beta', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping electra/embeddings/LayerNorm/beta/adam_v ['electra', 'embeddings', 'LayerNorm', 'beta', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'embeddings', 'LayerNorm', 'gamma'] electra/embeddings/LayerNorm/gamma\n",
      "Skipping electra/embeddings/LayerNorm/gamma/adam_m ['electra', 'embeddings', 'LayerNorm', 'gamma', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping electra/embeddings/LayerNorm/gamma/adam_v ['electra', 'embeddings', 'LayerNorm', 'gamma', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'embeddings', 'position_embeddings'] electra/embeddings/position_embeddings\n",
      "Skipping electra/embeddings/position_embeddings/adam_m ['electra', 'embeddings', 'position_embeddings', 'adam_m'] 'Embedding' object has no attribute 'adam_m'\n",
      "Skipping electra/embeddings/position_embeddings/adam_v ['electra', 'embeddings', 'position_embeddings', 'adam_v'] 'Embedding' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'embeddings', 'token_type_embeddings'] electra/embeddings/token_type_embeddings\n",
      "Skipping electra/embeddings/token_type_embeddings/adam_m ['electra', 'embeddings', 'token_type_embeddings', 'adam_m'] 'Embedding' object has no attribute 'adam_m'\n",
      "Skipping electra/embeddings/token_type_embeddings/adam_v ['electra', 'embeddings', 'token_type_embeddings', 'adam_v'] 'Embedding' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'embeddings', 'word_embeddings'] electra/embeddings/word_embeddings\n",
      "Skipping electra/embeddings/word_embeddings/adam_m ['electra', 'embeddings', 'word_embeddings', 'adam_m'] 'Embedding' object has no attribute 'adam_m'\n",
      "Skipping electra/embeddings/word_embeddings/adam_v ['electra', 'embeddings', 'word_embeddings', 'adam_v'] 'Embedding' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_0', 'attention', 'output', 'LayerNorm', 'beta'] electra/encoder/layer_0/attention/output/LayerNorm/beta\n",
      "Skipping electra/encoder/layer_0/attention/output/LayerNorm/beta/adam_m ['electra', 'encoder', 'layer_0', 'attention', 'output', 'LayerNorm', 'beta', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping electra/encoder/layer_0/attention/output/LayerNorm/beta/adam_v ['electra', 'encoder', 'layer_0', 'attention', 'output', 'LayerNorm', 'beta', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_0', 'attention', 'output', 'LayerNorm', 'gamma'] electra/encoder/layer_0/attention/output/LayerNorm/gamma\n",
      "Skipping electra/encoder/layer_0/attention/output/LayerNorm/gamma/adam_m ['electra', 'encoder', 'layer_0', 'attention', 'output', 'LayerNorm', 'gamma', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping electra/encoder/layer_0/attention/output/LayerNorm/gamma/adam_v ['electra', 'encoder', 'layer_0', 'attention', 'output', 'LayerNorm', 'gamma', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_0', 'attention', 'output', 'dense', 'bias'] electra/encoder/layer_0/attention/output/dense/bias\n",
      "Skipping electra/encoder/layer_0/attention/output/dense/bias/adam_m ['electra', 'encoder', 'layer_0', 'attention', 'output', 'dense', 'bias', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping electra/encoder/layer_0/attention/output/dense/bias/adam_v ['electra', 'encoder', 'layer_0', 'attention', 'output', 'dense', 'bias', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_0', 'attention', 'output', 'dense', 'kernel'] electra/encoder/layer_0/attention/output/dense/kernel\n",
      "Skipping electra/encoder/layer_0/attention/output/dense/kernel/adam_m ['electra', 'encoder', 'layer_0', 'attention', 'output', 'dense', 'kernel', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping electra/encoder/layer_0/attention/output/dense/kernel/adam_v ['electra', 'encoder', 'layer_0', 'attention', 'output', 'dense', 'kernel', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_0', 'attention', 'self', 'key', 'bias'] electra/encoder/layer_0/attention/self/key/bias\n",
      "Skipping electra/encoder/layer_0/attention/self/key/bias/adam_m ['electra', 'encoder', 'layer_0', 'attention', 'self', 'key', 'bias', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping electra/encoder/layer_0/attention/self/key/bias/adam_v ['electra', 'encoder', 'layer_0', 'attention', 'self', 'key', 'bias', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_0', 'attention', 'self', 'key', 'kernel'] electra/encoder/layer_0/attention/self/key/kernel\n",
      "Skipping electra/encoder/layer_0/attention/self/key/kernel/adam_m ['electra', 'encoder', 'layer_0', 'attention', 'self', 'key', 'kernel', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping electra/encoder/layer_0/attention/self/key/kernel/adam_v ['electra', 'encoder', 'layer_0', 'attention', 'self', 'key', 'kernel', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_0', 'attention', 'self', 'query', 'bias'] electra/encoder/layer_0/attention/self/query/bias\n",
      "Skipping electra/encoder/layer_0/attention/self/query/bias/adam_m ['electra', 'encoder', 'layer_0', 'attention', 'self', 'query', 'bias', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping electra/encoder/layer_0/attention/self/query/bias/adam_v ['electra', 'encoder', 'layer_0', 'attention', 'self', 'query', 'bias', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_0', 'attention', 'self', 'query', 'kernel'] electra/encoder/layer_0/attention/self/query/kernel\n",
      "Skipping electra/encoder/layer_0/attention/self/query/kernel/adam_m ['electra', 'encoder', 'layer_0', 'attention', 'self', 'query', 'kernel', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping electra/encoder/layer_0/attention/self/query/kernel/adam_v ['electra', 'encoder', 'layer_0', 'attention', 'self', 'query', 'kernel', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_0', 'attention', 'self', 'value', 'bias'] electra/encoder/layer_0/attention/self/value/bias\n",
      "Skipping electra/encoder/layer_0/attention/self/value/bias/adam_m ['electra', 'encoder', 'layer_0', 'attention', 'self', 'value', 'bias', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping electra/encoder/layer_0/attention/self/value/bias/adam_v ['electra', 'encoder', 'layer_0', 'attention', 'self', 'value', 'bias', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_0', 'attention', 'self', 'value', 'kernel'] electra/encoder/layer_0/attention/self/value/kernel\n",
      "Skipping electra/encoder/layer_0/attention/self/value/kernel/adam_m ['electra', 'encoder', 'layer_0', 'attention', 'self', 'value', 'kernel', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping electra/encoder/layer_0/attention/self/value/kernel/adam_v ['electra', 'encoder', 'layer_0', 'attention', 'self', 'value', 'kernel', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_0', 'intermediate', 'dense', 'bias'] electra/encoder/layer_0/intermediate/dense/bias\n",
      "Skipping electra/encoder/layer_0/intermediate/dense/bias/adam_m ['electra', 'encoder', 'layer_0', 'intermediate', 'dense', 'bias', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping electra/encoder/layer_0/intermediate/dense/bias/adam_v ['electra', 'encoder', 'layer_0', 'intermediate', 'dense', 'bias', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_0', 'intermediate', 'dense', 'kernel'] electra/encoder/layer_0/intermediate/dense/kernel\n",
      "Skipping electra/encoder/layer_0/intermediate/dense/kernel/adam_m ['electra', 'encoder', 'layer_0', 'intermediate', 'dense', 'kernel', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping electra/encoder/layer_0/intermediate/dense/kernel/adam_v ['electra', 'encoder', 'layer_0', 'intermediate', 'dense', 'kernel', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_0', 'output', 'LayerNorm', 'beta'] electra/encoder/layer_0/output/LayerNorm/beta\n",
      "Skipping electra/encoder/layer_0/output/LayerNorm/beta/adam_m ['electra', 'encoder', 'layer_0', 'output', 'LayerNorm', 'beta', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping electra/encoder/layer_0/output/LayerNorm/beta/adam_v ['electra', 'encoder', 'layer_0', 'output', 'LayerNorm', 'beta', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_0', 'output', 'LayerNorm', 'gamma'] electra/encoder/layer_0/output/LayerNorm/gamma\n",
      "Skipping electra/encoder/layer_0/output/LayerNorm/gamma/adam_m ['electra', 'encoder', 'layer_0', 'output', 'LayerNorm', 'gamma', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping electra/encoder/layer_0/output/LayerNorm/gamma/adam_v ['electra', 'encoder', 'layer_0', 'output', 'LayerNorm', 'gamma', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_0', 'output', 'dense', 'bias'] electra/encoder/layer_0/output/dense/bias\n",
      "Skipping electra/encoder/layer_0/output/dense/bias/adam_m ['electra', 'encoder', 'layer_0', 'output', 'dense', 'bias', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping electra/encoder/layer_0/output/dense/bias/adam_v ['electra', 'encoder', 'layer_0', 'output', 'dense', 'bias', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_0', 'output', 'dense', 'kernel'] electra/encoder/layer_0/output/dense/kernel\n",
      "Skipping electra/encoder/layer_0/output/dense/kernel/adam_m ['electra', 'encoder', 'layer_0', 'output', 'dense', 'kernel', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping electra/encoder/layer_0/output/dense/kernel/adam_v ['electra', 'encoder', 'layer_0', 'output', 'dense', 'kernel', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_1', 'attention', 'output', 'LayerNorm', 'beta'] electra/encoder/layer_1/attention/output/LayerNorm/beta\n",
      "Skipping electra/encoder/layer_1/attention/output/LayerNorm/beta/adam_m ['electra', 'encoder', 'layer_1', 'attention', 'output', 'LayerNorm', 'beta', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping electra/encoder/layer_1/attention/output/LayerNorm/beta/adam_v ['electra', 'encoder', 'layer_1', 'attention', 'output', 'LayerNorm', 'beta', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_1', 'attention', 'output', 'LayerNorm', 'gamma'] electra/encoder/layer_1/attention/output/LayerNorm/gamma\n",
      "Skipping electra/encoder/layer_1/attention/output/LayerNorm/gamma/adam_m ['electra', 'encoder', 'layer_1', 'attention', 'output', 'LayerNorm', 'gamma', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping electra/encoder/layer_1/attention/output/LayerNorm/gamma/adam_v ['electra', 'encoder', 'layer_1', 'attention', 'output', 'LayerNorm', 'gamma', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_1', 'attention', 'output', 'dense', 'bias'] electra/encoder/layer_1/attention/output/dense/bias\n",
      "Skipping electra/encoder/layer_1/attention/output/dense/bias/adam_m ['electra', 'encoder', 'layer_1', 'attention', 'output', 'dense', 'bias', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping electra/encoder/layer_1/attention/output/dense/bias/adam_v ['electra', 'encoder', 'layer_1', 'attention', 'output', 'dense', 'bias', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_1', 'attention', 'output', 'dense', 'kernel'] electra/encoder/layer_1/attention/output/dense/kernel\n",
      "Skipping electra/encoder/layer_1/attention/output/dense/kernel/adam_m ['electra', 'encoder', 'layer_1', 'attention', 'output', 'dense', 'kernel', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping electra/encoder/layer_1/attention/output/dense/kernel/adam_v ['electra', 'encoder', 'layer_1', 'attention', 'output', 'dense', 'kernel', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_1', 'attention', 'self', 'key', 'bias'] electra/encoder/layer_1/attention/self/key/bias\n",
      "Skipping electra/encoder/layer_1/attention/self/key/bias/adam_m ['electra', 'encoder', 'layer_1', 'attention', 'self', 'key', 'bias', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping electra/encoder/layer_1/attention/self/key/bias/adam_v ['electra', 'encoder', 'layer_1', 'attention', 'self', 'key', 'bias', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_1', 'attention', 'self', 'key', 'kernel'] electra/encoder/layer_1/attention/self/key/kernel\n",
      "Skipping electra/encoder/layer_1/attention/self/key/kernel/adam_m ['electra', 'encoder', 'layer_1', 'attention', 'self', 'key', 'kernel', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping electra/encoder/layer_1/attention/self/key/kernel/adam_v ['electra', 'encoder', 'layer_1', 'attention', 'self', 'key', 'kernel', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_1', 'attention', 'self', 'query', 'bias'] electra/encoder/layer_1/attention/self/query/bias\n",
      "Skipping electra/encoder/layer_1/attention/self/query/bias/adam_m ['electra', 'encoder', 'layer_1', 'attention', 'self', 'query', 'bias', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping electra/encoder/layer_1/attention/self/query/bias/adam_v ['electra', 'encoder', 'layer_1', 'attention', 'self', 'query', 'bias', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_1', 'attention', 'self', 'query', 'kernel'] electra/encoder/layer_1/attention/self/query/kernel\n",
      "Skipping electra/encoder/layer_1/attention/self/query/kernel/adam_m ['electra', 'encoder', 'layer_1', 'attention', 'self', 'query', 'kernel', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping electra/encoder/layer_1/attention/self/query/kernel/adam_v ['electra', 'encoder', 'layer_1', 'attention', 'self', 'query', 'kernel', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_1', 'attention', 'self', 'value', 'bias'] electra/encoder/layer_1/attention/self/value/bias\n",
      "Skipping electra/encoder/layer_1/attention/self/value/bias/adam_m ['electra', 'encoder', 'layer_1', 'attention', 'self', 'value', 'bias', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping electra/encoder/layer_1/attention/self/value/bias/adam_v ['electra', 'encoder', 'layer_1', 'attention', 'self', 'value', 'bias', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_1', 'attention', 'self', 'value', 'kernel'] electra/encoder/layer_1/attention/self/value/kernel\n",
      "Skipping electra/encoder/layer_1/attention/self/value/kernel/adam_m ['electra', 'encoder', 'layer_1', 'attention', 'self', 'value', 'kernel', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping electra/encoder/layer_1/attention/self/value/kernel/adam_v ['electra', 'encoder', 'layer_1', 'attention', 'self', 'value', 'kernel', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_1', 'intermediate', 'dense', 'bias'] electra/encoder/layer_1/intermediate/dense/bias\n",
      "Skipping electra/encoder/layer_1/intermediate/dense/bias/adam_m ['electra', 'encoder', 'layer_1', 'intermediate', 'dense', 'bias', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping electra/encoder/layer_1/intermediate/dense/bias/adam_v ['electra', 'encoder', 'layer_1', 'intermediate', 'dense', 'bias', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_1', 'intermediate', 'dense', 'kernel'] electra/encoder/layer_1/intermediate/dense/kernel\n",
      "Skipping electra/encoder/layer_1/intermediate/dense/kernel/adam_m ['electra', 'encoder', 'layer_1', 'intermediate', 'dense', 'kernel', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping electra/encoder/layer_1/intermediate/dense/kernel/adam_v ['electra', 'encoder', 'layer_1', 'intermediate', 'dense', 'kernel', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_1', 'output', 'LayerNorm', 'beta'] electra/encoder/layer_1/output/LayerNorm/beta\n",
      "Skipping electra/encoder/layer_1/output/LayerNorm/beta/adam_m ['electra', 'encoder', 'layer_1', 'output', 'LayerNorm', 'beta', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping electra/encoder/layer_1/output/LayerNorm/beta/adam_v ['electra', 'encoder', 'layer_1', 'output', 'LayerNorm', 'beta', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_1', 'output', 'LayerNorm', 'gamma'] electra/encoder/layer_1/output/LayerNorm/gamma\n",
      "Skipping electra/encoder/layer_1/output/LayerNorm/gamma/adam_m ['electra', 'encoder', 'layer_1', 'output', 'LayerNorm', 'gamma', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping electra/encoder/layer_1/output/LayerNorm/gamma/adam_v ['electra', 'encoder', 'layer_1', 'output', 'LayerNorm', 'gamma', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_1', 'output', 'dense', 'bias'] electra/encoder/layer_1/output/dense/bias\n",
      "Skipping electra/encoder/layer_1/output/dense/bias/adam_m ['electra', 'encoder', 'layer_1', 'output', 'dense', 'bias', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping electra/encoder/layer_1/output/dense/bias/adam_v ['electra', 'encoder', 'layer_1', 'output', 'dense', 'bias', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_1', 'output', 'dense', 'kernel'] electra/encoder/layer_1/output/dense/kernel\n",
      "Skipping electra/encoder/layer_1/output/dense/kernel/adam_m ['electra', 'encoder', 'layer_1', 'output', 'dense', 'kernel', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping electra/encoder/layer_1/output/dense/kernel/adam_v ['electra', 'encoder', 'layer_1', 'output', 'dense', 'kernel', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_10', 'attention', 'output', 'LayerNorm', 'beta'] electra/encoder/layer_10/attention/output/LayerNorm/beta\n",
      "Skipping electra/encoder/layer_10/attention/output/LayerNorm/beta/adam_m ['electra', 'encoder', 'layer_10', 'attention', 'output', 'LayerNorm', 'beta', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping electra/encoder/layer_10/attention/output/LayerNorm/beta/adam_v ['electra', 'encoder', 'layer_10', 'attention', 'output', 'LayerNorm', 'beta', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_10', 'attention', 'output', 'LayerNorm', 'gamma'] electra/encoder/layer_10/attention/output/LayerNorm/gamma\n",
      "Skipping electra/encoder/layer_10/attention/output/LayerNorm/gamma/adam_m ['electra', 'encoder', 'layer_10', 'attention', 'output', 'LayerNorm', 'gamma', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping electra/encoder/layer_10/attention/output/LayerNorm/gamma/adam_v ['electra', 'encoder', 'layer_10', 'attention', 'output', 'LayerNorm', 'gamma', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_10', 'attention', 'output', 'dense', 'bias'] electra/encoder/layer_10/attention/output/dense/bias\n",
      "Skipping electra/encoder/layer_10/attention/output/dense/bias/adam_m ['electra', 'encoder', 'layer_10', 'attention', 'output', 'dense', 'bias', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping electra/encoder/layer_10/attention/output/dense/bias/adam_v ['electra', 'encoder', 'layer_10', 'attention', 'output', 'dense', 'bias', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_10', 'attention', 'output', 'dense', 'kernel'] electra/encoder/layer_10/attention/output/dense/kernel\n",
      "Skipping electra/encoder/layer_10/attention/output/dense/kernel/adam_m ['electra', 'encoder', 'layer_10', 'attention', 'output', 'dense', 'kernel', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping electra/encoder/layer_10/attention/output/dense/kernel/adam_v ['electra', 'encoder', 'layer_10', 'attention', 'output', 'dense', 'kernel', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_10', 'attention', 'self', 'key', 'bias'] electra/encoder/layer_10/attention/self/key/bias\n",
      "Skipping electra/encoder/layer_10/attention/self/key/bias/adam_m ['electra', 'encoder', 'layer_10', 'attention', 'self', 'key', 'bias', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping electra/encoder/layer_10/attention/self/key/bias/adam_v ['electra', 'encoder', 'layer_10', 'attention', 'self', 'key', 'bias', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_10', 'attention', 'self', 'key', 'kernel'] electra/encoder/layer_10/attention/self/key/kernel\n",
      "Skipping electra/encoder/layer_10/attention/self/key/kernel/adam_m ['electra', 'encoder', 'layer_10', 'attention', 'self', 'key', 'kernel', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping electra/encoder/layer_10/attention/self/key/kernel/adam_v ['electra', 'encoder', 'layer_10', 'attention', 'self', 'key', 'kernel', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_10', 'attention', 'self', 'query', 'bias'] electra/encoder/layer_10/attention/self/query/bias\n",
      "Skipping electra/encoder/layer_10/attention/self/query/bias/adam_m ['electra', 'encoder', 'layer_10', 'attention', 'self', 'query', 'bias', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping electra/encoder/layer_10/attention/self/query/bias/adam_v ['electra', 'encoder', 'layer_10', 'attention', 'self', 'query', 'bias', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_10', 'attention', 'self', 'query', 'kernel'] electra/encoder/layer_10/attention/self/query/kernel\n",
      "Skipping electra/encoder/layer_10/attention/self/query/kernel/adam_m ['electra', 'encoder', 'layer_10', 'attention', 'self', 'query', 'kernel', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping electra/encoder/layer_10/attention/self/query/kernel/adam_v ['electra', 'encoder', 'layer_10', 'attention', 'self', 'query', 'kernel', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_10', 'attention', 'self', 'value', 'bias'] electra/encoder/layer_10/attention/self/value/bias\n",
      "Skipping electra/encoder/layer_10/attention/self/value/bias/adam_m ['electra', 'encoder', 'layer_10', 'attention', 'self', 'value', 'bias', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping electra/encoder/layer_10/attention/self/value/bias/adam_v ['electra', 'encoder', 'layer_10', 'attention', 'self', 'value', 'bias', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_10', 'attention', 'self', 'value', 'kernel'] electra/encoder/layer_10/attention/self/value/kernel\n",
      "Skipping electra/encoder/layer_10/attention/self/value/kernel/adam_m ['electra', 'encoder', 'layer_10', 'attention', 'self', 'value', 'kernel', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping electra/encoder/layer_10/attention/self/value/kernel/adam_v ['electra', 'encoder', 'layer_10', 'attention', 'self', 'value', 'kernel', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_10', 'intermediate', 'dense', 'bias'] electra/encoder/layer_10/intermediate/dense/bias\n",
      "Skipping electra/encoder/layer_10/intermediate/dense/bias/adam_m ['electra', 'encoder', 'layer_10', 'intermediate', 'dense', 'bias', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping electra/encoder/layer_10/intermediate/dense/bias/adam_v ['electra', 'encoder', 'layer_10', 'intermediate', 'dense', 'bias', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_10', 'intermediate', 'dense', 'kernel'] electra/encoder/layer_10/intermediate/dense/kernel\n",
      "Skipping electra/encoder/layer_10/intermediate/dense/kernel/adam_m ['electra', 'encoder', 'layer_10', 'intermediate', 'dense', 'kernel', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping electra/encoder/layer_10/intermediate/dense/kernel/adam_v ['electra', 'encoder', 'layer_10', 'intermediate', 'dense', 'kernel', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_10', 'output', 'LayerNorm', 'beta'] electra/encoder/layer_10/output/LayerNorm/beta\n",
      "Skipping electra/encoder/layer_10/output/LayerNorm/beta/adam_m ['electra', 'encoder', 'layer_10', 'output', 'LayerNorm', 'beta', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping electra/encoder/layer_10/output/LayerNorm/beta/adam_v ['electra', 'encoder', 'layer_10', 'output', 'LayerNorm', 'beta', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_10', 'output', 'LayerNorm', 'gamma'] electra/encoder/layer_10/output/LayerNorm/gamma\n",
      "Skipping electra/encoder/layer_10/output/LayerNorm/gamma/adam_m ['electra', 'encoder', 'layer_10', 'output', 'LayerNorm', 'gamma', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping electra/encoder/layer_10/output/LayerNorm/gamma/adam_v ['electra', 'encoder', 'layer_10', 'output', 'LayerNorm', 'gamma', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_10', 'output', 'dense', 'bias'] electra/encoder/layer_10/output/dense/bias\n",
      "Skipping electra/encoder/layer_10/output/dense/bias/adam_m ['electra', 'encoder', 'layer_10', 'output', 'dense', 'bias', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping electra/encoder/layer_10/output/dense/bias/adam_v ['electra', 'encoder', 'layer_10', 'output', 'dense', 'bias', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_10', 'output', 'dense', 'kernel'] electra/encoder/layer_10/output/dense/kernel\n",
      "Skipping electra/encoder/layer_10/output/dense/kernel/adam_m ['electra', 'encoder', 'layer_10', 'output', 'dense', 'kernel', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping electra/encoder/layer_10/output/dense/kernel/adam_v ['electra', 'encoder', 'layer_10', 'output', 'dense', 'kernel', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_11', 'attention', 'output', 'LayerNorm', 'beta'] electra/encoder/layer_11/attention/output/LayerNorm/beta\n",
      "Skipping electra/encoder/layer_11/attention/output/LayerNorm/beta/adam_m ['electra', 'encoder', 'layer_11', 'attention', 'output', 'LayerNorm', 'beta', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping electra/encoder/layer_11/attention/output/LayerNorm/beta/adam_v ['electra', 'encoder', 'layer_11', 'attention', 'output', 'LayerNorm', 'beta', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_11', 'attention', 'output', 'LayerNorm', 'gamma'] electra/encoder/layer_11/attention/output/LayerNorm/gamma\n",
      "Skipping electra/encoder/layer_11/attention/output/LayerNorm/gamma/adam_m ['electra', 'encoder', 'layer_11', 'attention', 'output', 'LayerNorm', 'gamma', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping electra/encoder/layer_11/attention/output/LayerNorm/gamma/adam_v ['electra', 'encoder', 'layer_11', 'attention', 'output', 'LayerNorm', 'gamma', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_11', 'attention', 'output', 'dense', 'bias'] electra/encoder/layer_11/attention/output/dense/bias\n",
      "Skipping electra/encoder/layer_11/attention/output/dense/bias/adam_m ['electra', 'encoder', 'layer_11', 'attention', 'output', 'dense', 'bias', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping electra/encoder/layer_11/attention/output/dense/bias/adam_v ['electra', 'encoder', 'layer_11', 'attention', 'output', 'dense', 'bias', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_11', 'attention', 'output', 'dense', 'kernel'] electra/encoder/layer_11/attention/output/dense/kernel\n",
      "Skipping electra/encoder/layer_11/attention/output/dense/kernel/adam_m ['electra', 'encoder', 'layer_11', 'attention', 'output', 'dense', 'kernel', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping electra/encoder/layer_11/attention/output/dense/kernel/adam_v ['electra', 'encoder', 'layer_11', 'attention', 'output', 'dense', 'kernel', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_11', 'attention', 'self', 'key', 'bias'] electra/encoder/layer_11/attention/self/key/bias\n",
      "Skipping electra/encoder/layer_11/attention/self/key/bias/adam_m ['electra', 'encoder', 'layer_11', 'attention', 'self', 'key', 'bias', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping electra/encoder/layer_11/attention/self/key/bias/adam_v ['electra', 'encoder', 'layer_11', 'attention', 'self', 'key', 'bias', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_11', 'attention', 'self', 'key', 'kernel'] electra/encoder/layer_11/attention/self/key/kernel\n",
      "Skipping electra/encoder/layer_11/attention/self/key/kernel/adam_m ['electra', 'encoder', 'layer_11', 'attention', 'self', 'key', 'kernel', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping electra/encoder/layer_11/attention/self/key/kernel/adam_v ['electra', 'encoder', 'layer_11', 'attention', 'self', 'key', 'kernel', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_11', 'attention', 'self', 'query', 'bias'] electra/encoder/layer_11/attention/self/query/bias\n",
      "Skipping electra/encoder/layer_11/attention/self/query/bias/adam_m ['electra', 'encoder', 'layer_11', 'attention', 'self', 'query', 'bias', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping electra/encoder/layer_11/attention/self/query/bias/adam_v ['electra', 'encoder', 'layer_11', 'attention', 'self', 'query', 'bias', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_11', 'attention', 'self', 'query', 'kernel'] electra/encoder/layer_11/attention/self/query/kernel\n",
      "Skipping electra/encoder/layer_11/attention/self/query/kernel/adam_m ['electra', 'encoder', 'layer_11', 'attention', 'self', 'query', 'kernel', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping electra/encoder/layer_11/attention/self/query/kernel/adam_v ['electra', 'encoder', 'layer_11', 'attention', 'self', 'query', 'kernel', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_11', 'attention', 'self', 'value', 'bias'] electra/encoder/layer_11/attention/self/value/bias\n",
      "Skipping electra/encoder/layer_11/attention/self/value/bias/adam_m ['electra', 'encoder', 'layer_11', 'attention', 'self', 'value', 'bias', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping electra/encoder/layer_11/attention/self/value/bias/adam_v ['electra', 'encoder', 'layer_11', 'attention', 'self', 'value', 'bias', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_11', 'attention', 'self', 'value', 'kernel'] electra/encoder/layer_11/attention/self/value/kernel\n",
      "Skipping electra/encoder/layer_11/attention/self/value/kernel/adam_m ['electra', 'encoder', 'layer_11', 'attention', 'self', 'value', 'kernel', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping electra/encoder/layer_11/attention/self/value/kernel/adam_v ['electra', 'encoder', 'layer_11', 'attention', 'self', 'value', 'kernel', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_11', 'intermediate', 'dense', 'bias'] electra/encoder/layer_11/intermediate/dense/bias\n",
      "Skipping electra/encoder/layer_11/intermediate/dense/bias/adam_m ['electra', 'encoder', 'layer_11', 'intermediate', 'dense', 'bias', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping electra/encoder/layer_11/intermediate/dense/bias/adam_v ['electra', 'encoder', 'layer_11', 'intermediate', 'dense', 'bias', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_11', 'intermediate', 'dense', 'kernel'] electra/encoder/layer_11/intermediate/dense/kernel\n",
      "Skipping electra/encoder/layer_11/intermediate/dense/kernel/adam_m ['electra', 'encoder', 'layer_11', 'intermediate', 'dense', 'kernel', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping electra/encoder/layer_11/intermediate/dense/kernel/adam_v ['electra', 'encoder', 'layer_11', 'intermediate', 'dense', 'kernel', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_11', 'output', 'LayerNorm', 'beta'] electra/encoder/layer_11/output/LayerNorm/beta\n",
      "Skipping electra/encoder/layer_11/output/LayerNorm/beta/adam_m ['electra', 'encoder', 'layer_11', 'output', 'LayerNorm', 'beta', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping electra/encoder/layer_11/output/LayerNorm/beta/adam_v ['electra', 'encoder', 'layer_11', 'output', 'LayerNorm', 'beta', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_11', 'output', 'LayerNorm', 'gamma'] electra/encoder/layer_11/output/LayerNorm/gamma\n",
      "Skipping electra/encoder/layer_11/output/LayerNorm/gamma/adam_m ['electra', 'encoder', 'layer_11', 'output', 'LayerNorm', 'gamma', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping electra/encoder/layer_11/output/LayerNorm/gamma/adam_v ['electra', 'encoder', 'layer_11', 'output', 'LayerNorm', 'gamma', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_11', 'output', 'dense', 'bias'] electra/encoder/layer_11/output/dense/bias\n",
      "Skipping electra/encoder/layer_11/output/dense/bias/adam_m ['electra', 'encoder', 'layer_11', 'output', 'dense', 'bias', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping electra/encoder/layer_11/output/dense/bias/adam_v ['electra', 'encoder', 'layer_11', 'output', 'dense', 'bias', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_11', 'output', 'dense', 'kernel'] electra/encoder/layer_11/output/dense/kernel\n",
      "Skipping electra/encoder/layer_11/output/dense/kernel/adam_m ['electra', 'encoder', 'layer_11', 'output', 'dense', 'kernel', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping electra/encoder/layer_11/output/dense/kernel/adam_v ['electra', 'encoder', 'layer_11', 'output', 'dense', 'kernel', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_2', 'attention', 'output', 'LayerNorm', 'beta'] electra/encoder/layer_2/attention/output/LayerNorm/beta\n",
      "Skipping electra/encoder/layer_2/attention/output/LayerNorm/beta/adam_m ['electra', 'encoder', 'layer_2', 'attention', 'output', 'LayerNorm', 'beta', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping electra/encoder/layer_2/attention/output/LayerNorm/beta/adam_v ['electra', 'encoder', 'layer_2', 'attention', 'output', 'LayerNorm', 'beta', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_2', 'attention', 'output', 'LayerNorm', 'gamma'] electra/encoder/layer_2/attention/output/LayerNorm/gamma\n",
      "Skipping electra/encoder/layer_2/attention/output/LayerNorm/gamma/adam_m ['electra', 'encoder', 'layer_2', 'attention', 'output', 'LayerNorm', 'gamma', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping electra/encoder/layer_2/attention/output/LayerNorm/gamma/adam_v ['electra', 'encoder', 'layer_2', 'attention', 'output', 'LayerNorm', 'gamma', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_2', 'attention', 'output', 'dense', 'bias'] electra/encoder/layer_2/attention/output/dense/bias\n",
      "Skipping electra/encoder/layer_2/attention/output/dense/bias/adam_m ['electra', 'encoder', 'layer_2', 'attention', 'output', 'dense', 'bias', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping electra/encoder/layer_2/attention/output/dense/bias/adam_v ['electra', 'encoder', 'layer_2', 'attention', 'output', 'dense', 'bias', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_2', 'attention', 'output', 'dense', 'kernel'] electra/encoder/layer_2/attention/output/dense/kernel\n",
      "Skipping electra/encoder/layer_2/attention/output/dense/kernel/adam_m ['electra', 'encoder', 'layer_2', 'attention', 'output', 'dense', 'kernel', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping electra/encoder/layer_2/attention/output/dense/kernel/adam_v ['electra', 'encoder', 'layer_2', 'attention', 'output', 'dense', 'kernel', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_2', 'attention', 'self', 'key', 'bias'] electra/encoder/layer_2/attention/self/key/bias\n",
      "Skipping electra/encoder/layer_2/attention/self/key/bias/adam_m ['electra', 'encoder', 'layer_2', 'attention', 'self', 'key', 'bias', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping electra/encoder/layer_2/attention/self/key/bias/adam_v ['electra', 'encoder', 'layer_2', 'attention', 'self', 'key', 'bias', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_2', 'attention', 'self', 'key', 'kernel'] electra/encoder/layer_2/attention/self/key/kernel\n",
      "Skipping electra/encoder/layer_2/attention/self/key/kernel/adam_m ['electra', 'encoder', 'layer_2', 'attention', 'self', 'key', 'kernel', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping electra/encoder/layer_2/attention/self/key/kernel/adam_v ['electra', 'encoder', 'layer_2', 'attention', 'self', 'key', 'kernel', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_2', 'attention', 'self', 'query', 'bias'] electra/encoder/layer_2/attention/self/query/bias\n",
      "Skipping electra/encoder/layer_2/attention/self/query/bias/adam_m ['electra', 'encoder', 'layer_2', 'attention', 'self', 'query', 'bias', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping electra/encoder/layer_2/attention/self/query/bias/adam_v ['electra', 'encoder', 'layer_2', 'attention', 'self', 'query', 'bias', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_2', 'attention', 'self', 'query', 'kernel'] electra/encoder/layer_2/attention/self/query/kernel\n",
      "Skipping electra/encoder/layer_2/attention/self/query/kernel/adam_m ['electra', 'encoder', 'layer_2', 'attention', 'self', 'query', 'kernel', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping electra/encoder/layer_2/attention/self/query/kernel/adam_v ['electra', 'encoder', 'layer_2', 'attention', 'self', 'query', 'kernel', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_2', 'attention', 'self', 'value', 'bias'] electra/encoder/layer_2/attention/self/value/bias\n",
      "Skipping electra/encoder/layer_2/attention/self/value/bias/adam_m ['electra', 'encoder', 'layer_2', 'attention', 'self', 'value', 'bias', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping electra/encoder/layer_2/attention/self/value/bias/adam_v ['electra', 'encoder', 'layer_2', 'attention', 'self', 'value', 'bias', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_2', 'attention', 'self', 'value', 'kernel'] electra/encoder/layer_2/attention/self/value/kernel\n",
      "Skipping electra/encoder/layer_2/attention/self/value/kernel/adam_m ['electra', 'encoder', 'layer_2', 'attention', 'self', 'value', 'kernel', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping electra/encoder/layer_2/attention/self/value/kernel/adam_v ['electra', 'encoder', 'layer_2', 'attention', 'self', 'value', 'kernel', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_2', 'intermediate', 'dense', 'bias'] electra/encoder/layer_2/intermediate/dense/bias\n",
      "Skipping electra/encoder/layer_2/intermediate/dense/bias/adam_m ['electra', 'encoder', 'layer_2', 'intermediate', 'dense', 'bias', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping electra/encoder/layer_2/intermediate/dense/bias/adam_v ['electra', 'encoder', 'layer_2', 'intermediate', 'dense', 'bias', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_2', 'intermediate', 'dense', 'kernel'] electra/encoder/layer_2/intermediate/dense/kernel\n",
      "Skipping electra/encoder/layer_2/intermediate/dense/kernel/adam_m ['electra', 'encoder', 'layer_2', 'intermediate', 'dense', 'kernel', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping electra/encoder/layer_2/intermediate/dense/kernel/adam_v ['electra', 'encoder', 'layer_2', 'intermediate', 'dense', 'kernel', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_2', 'output', 'LayerNorm', 'beta'] electra/encoder/layer_2/output/LayerNorm/beta\n",
      "Skipping electra/encoder/layer_2/output/LayerNorm/beta/adam_m ['electra', 'encoder', 'layer_2', 'output', 'LayerNorm', 'beta', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping electra/encoder/layer_2/output/LayerNorm/beta/adam_v ['electra', 'encoder', 'layer_2', 'output', 'LayerNorm', 'beta', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_2', 'output', 'LayerNorm', 'gamma'] electra/encoder/layer_2/output/LayerNorm/gamma\n",
      "Skipping electra/encoder/layer_2/output/LayerNorm/gamma/adam_m ['electra', 'encoder', 'layer_2', 'output', 'LayerNorm', 'gamma', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping electra/encoder/layer_2/output/LayerNorm/gamma/adam_v ['electra', 'encoder', 'layer_2', 'output', 'LayerNorm', 'gamma', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_2', 'output', 'dense', 'bias'] electra/encoder/layer_2/output/dense/bias\n",
      "Skipping electra/encoder/layer_2/output/dense/bias/adam_m ['electra', 'encoder', 'layer_2', 'output', 'dense', 'bias', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping electra/encoder/layer_2/output/dense/bias/adam_v ['electra', 'encoder', 'layer_2', 'output', 'dense', 'bias', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_2', 'output', 'dense', 'kernel'] electra/encoder/layer_2/output/dense/kernel\n",
      "Skipping electra/encoder/layer_2/output/dense/kernel/adam_m ['electra', 'encoder', 'layer_2', 'output', 'dense', 'kernel', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping electra/encoder/layer_2/output/dense/kernel/adam_v ['electra', 'encoder', 'layer_2', 'output', 'dense', 'kernel', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_3', 'attention', 'output', 'LayerNorm', 'beta'] electra/encoder/layer_3/attention/output/LayerNorm/beta\n",
      "Skipping electra/encoder/layer_3/attention/output/LayerNorm/beta/adam_m ['electra', 'encoder', 'layer_3', 'attention', 'output', 'LayerNorm', 'beta', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping electra/encoder/layer_3/attention/output/LayerNorm/beta/adam_v ['electra', 'encoder', 'layer_3', 'attention', 'output', 'LayerNorm', 'beta', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_3', 'attention', 'output', 'LayerNorm', 'gamma'] electra/encoder/layer_3/attention/output/LayerNorm/gamma\n",
      "Skipping electra/encoder/layer_3/attention/output/LayerNorm/gamma/adam_m ['electra', 'encoder', 'layer_3', 'attention', 'output', 'LayerNorm', 'gamma', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping electra/encoder/layer_3/attention/output/LayerNorm/gamma/adam_v ['electra', 'encoder', 'layer_3', 'attention', 'output', 'LayerNorm', 'gamma', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_3', 'attention', 'output', 'dense', 'bias'] electra/encoder/layer_3/attention/output/dense/bias\n",
      "Skipping electra/encoder/layer_3/attention/output/dense/bias/adam_m ['electra', 'encoder', 'layer_3', 'attention', 'output', 'dense', 'bias', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping electra/encoder/layer_3/attention/output/dense/bias/adam_v ['electra', 'encoder', 'layer_3', 'attention', 'output', 'dense', 'bias', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_3', 'attention', 'output', 'dense', 'kernel'] electra/encoder/layer_3/attention/output/dense/kernel\n",
      "Skipping electra/encoder/layer_3/attention/output/dense/kernel/adam_m ['electra', 'encoder', 'layer_3', 'attention', 'output', 'dense', 'kernel', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping electra/encoder/layer_3/attention/output/dense/kernel/adam_v ['electra', 'encoder', 'layer_3', 'attention', 'output', 'dense', 'kernel', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_3', 'attention', 'self', 'key', 'bias'] electra/encoder/layer_3/attention/self/key/bias\n",
      "Skipping electra/encoder/layer_3/attention/self/key/bias/adam_m ['electra', 'encoder', 'layer_3', 'attention', 'self', 'key', 'bias', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping electra/encoder/layer_3/attention/self/key/bias/adam_v ['electra', 'encoder', 'layer_3', 'attention', 'self', 'key', 'bias', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_3', 'attention', 'self', 'key', 'kernel'] electra/encoder/layer_3/attention/self/key/kernel\n",
      "Skipping electra/encoder/layer_3/attention/self/key/kernel/adam_m ['electra', 'encoder', 'layer_3', 'attention', 'self', 'key', 'kernel', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping electra/encoder/layer_3/attention/self/key/kernel/adam_v ['electra', 'encoder', 'layer_3', 'attention', 'self', 'key', 'kernel', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_3', 'attention', 'self', 'query', 'bias'] electra/encoder/layer_3/attention/self/query/bias\n",
      "Skipping electra/encoder/layer_3/attention/self/query/bias/adam_m ['electra', 'encoder', 'layer_3', 'attention', 'self', 'query', 'bias', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping electra/encoder/layer_3/attention/self/query/bias/adam_v ['electra', 'encoder', 'layer_3', 'attention', 'self', 'query', 'bias', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_3', 'attention', 'self', 'query', 'kernel'] electra/encoder/layer_3/attention/self/query/kernel\n",
      "Skipping electra/encoder/layer_3/attention/self/query/kernel/adam_m ['electra', 'encoder', 'layer_3', 'attention', 'self', 'query', 'kernel', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping electra/encoder/layer_3/attention/self/query/kernel/adam_v ['electra', 'encoder', 'layer_3', 'attention', 'self', 'query', 'kernel', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_3', 'attention', 'self', 'value', 'bias'] electra/encoder/layer_3/attention/self/value/bias\n",
      "Skipping electra/encoder/layer_3/attention/self/value/bias/adam_m ['electra', 'encoder', 'layer_3', 'attention', 'self', 'value', 'bias', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping electra/encoder/layer_3/attention/self/value/bias/adam_v ['electra', 'encoder', 'layer_3', 'attention', 'self', 'value', 'bias', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_3', 'attention', 'self', 'value', 'kernel'] electra/encoder/layer_3/attention/self/value/kernel\n",
      "Skipping electra/encoder/layer_3/attention/self/value/kernel/adam_m ['electra', 'encoder', 'layer_3', 'attention', 'self', 'value', 'kernel', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping electra/encoder/layer_3/attention/self/value/kernel/adam_v ['electra', 'encoder', 'layer_3', 'attention', 'self', 'value', 'kernel', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_3', 'intermediate', 'dense', 'bias'] electra/encoder/layer_3/intermediate/dense/bias\n",
      "Skipping electra/encoder/layer_3/intermediate/dense/bias/adam_m ['electra', 'encoder', 'layer_3', 'intermediate', 'dense', 'bias', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping electra/encoder/layer_3/intermediate/dense/bias/adam_v ['electra', 'encoder', 'layer_3', 'intermediate', 'dense', 'bias', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_3', 'intermediate', 'dense', 'kernel'] electra/encoder/layer_3/intermediate/dense/kernel\n",
      "Skipping electra/encoder/layer_3/intermediate/dense/kernel/adam_m ['electra', 'encoder', 'layer_3', 'intermediate', 'dense', 'kernel', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping electra/encoder/layer_3/intermediate/dense/kernel/adam_v ['electra', 'encoder', 'layer_3', 'intermediate', 'dense', 'kernel', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_3', 'output', 'LayerNorm', 'beta'] electra/encoder/layer_3/output/LayerNorm/beta\n",
      "Skipping electra/encoder/layer_3/output/LayerNorm/beta/adam_m ['electra', 'encoder', 'layer_3', 'output', 'LayerNorm', 'beta', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping electra/encoder/layer_3/output/LayerNorm/beta/adam_v ['electra', 'encoder', 'layer_3', 'output', 'LayerNorm', 'beta', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_3', 'output', 'LayerNorm', 'gamma'] electra/encoder/layer_3/output/LayerNorm/gamma\n",
      "Skipping electra/encoder/layer_3/output/LayerNorm/gamma/adam_m ['electra', 'encoder', 'layer_3', 'output', 'LayerNorm', 'gamma', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping electra/encoder/layer_3/output/LayerNorm/gamma/adam_v ['electra', 'encoder', 'layer_3', 'output', 'LayerNorm', 'gamma', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_3', 'output', 'dense', 'bias'] electra/encoder/layer_3/output/dense/bias\n",
      "Skipping electra/encoder/layer_3/output/dense/bias/adam_m ['electra', 'encoder', 'layer_3', 'output', 'dense', 'bias', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping electra/encoder/layer_3/output/dense/bias/adam_v ['electra', 'encoder', 'layer_3', 'output', 'dense', 'bias', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_3', 'output', 'dense', 'kernel'] electra/encoder/layer_3/output/dense/kernel\n",
      "Skipping electra/encoder/layer_3/output/dense/kernel/adam_m ['electra', 'encoder', 'layer_3', 'output', 'dense', 'kernel', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping electra/encoder/layer_3/output/dense/kernel/adam_v ['electra', 'encoder', 'layer_3', 'output', 'dense', 'kernel', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_4', 'attention', 'output', 'LayerNorm', 'beta'] electra/encoder/layer_4/attention/output/LayerNorm/beta\n",
      "Skipping electra/encoder/layer_4/attention/output/LayerNorm/beta/adam_m ['electra', 'encoder', 'layer_4', 'attention', 'output', 'LayerNorm', 'beta', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping electra/encoder/layer_4/attention/output/LayerNorm/beta/adam_v ['electra', 'encoder', 'layer_4', 'attention', 'output', 'LayerNorm', 'beta', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_4', 'attention', 'output', 'LayerNorm', 'gamma'] electra/encoder/layer_4/attention/output/LayerNorm/gamma\n",
      "Skipping electra/encoder/layer_4/attention/output/LayerNorm/gamma/adam_m ['electra', 'encoder', 'layer_4', 'attention', 'output', 'LayerNorm', 'gamma', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping electra/encoder/layer_4/attention/output/LayerNorm/gamma/adam_v ['electra', 'encoder', 'layer_4', 'attention', 'output', 'LayerNorm', 'gamma', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_4', 'attention', 'output', 'dense', 'bias'] electra/encoder/layer_4/attention/output/dense/bias\n",
      "Skipping electra/encoder/layer_4/attention/output/dense/bias/adam_m ['electra', 'encoder', 'layer_4', 'attention', 'output', 'dense', 'bias', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping electra/encoder/layer_4/attention/output/dense/bias/adam_v ['electra', 'encoder', 'layer_4', 'attention', 'output', 'dense', 'bias', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_4', 'attention', 'output', 'dense', 'kernel'] electra/encoder/layer_4/attention/output/dense/kernel\n",
      "Skipping electra/encoder/layer_4/attention/output/dense/kernel/adam_m ['electra', 'encoder', 'layer_4', 'attention', 'output', 'dense', 'kernel', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping electra/encoder/layer_4/attention/output/dense/kernel/adam_v ['electra', 'encoder', 'layer_4', 'attention', 'output', 'dense', 'kernel', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_4', 'attention', 'self', 'key', 'bias'] electra/encoder/layer_4/attention/self/key/bias\n",
      "Skipping electra/encoder/layer_4/attention/self/key/bias/adam_m ['electra', 'encoder', 'layer_4', 'attention', 'self', 'key', 'bias', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping electra/encoder/layer_4/attention/self/key/bias/adam_v ['electra', 'encoder', 'layer_4', 'attention', 'self', 'key', 'bias', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_4', 'attention', 'self', 'key', 'kernel'] electra/encoder/layer_4/attention/self/key/kernel\n",
      "Skipping electra/encoder/layer_4/attention/self/key/kernel/adam_m ['electra', 'encoder', 'layer_4', 'attention', 'self', 'key', 'kernel', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping electra/encoder/layer_4/attention/self/key/kernel/adam_v ['electra', 'encoder', 'layer_4', 'attention', 'self', 'key', 'kernel', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_4', 'attention', 'self', 'query', 'bias'] electra/encoder/layer_4/attention/self/query/bias\n",
      "Skipping electra/encoder/layer_4/attention/self/query/bias/adam_m ['electra', 'encoder', 'layer_4', 'attention', 'self', 'query', 'bias', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping electra/encoder/layer_4/attention/self/query/bias/adam_v ['electra', 'encoder', 'layer_4', 'attention', 'self', 'query', 'bias', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_4', 'attention', 'self', 'query', 'kernel'] electra/encoder/layer_4/attention/self/query/kernel\n",
      "Skipping electra/encoder/layer_4/attention/self/query/kernel/adam_m ['electra', 'encoder', 'layer_4', 'attention', 'self', 'query', 'kernel', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping electra/encoder/layer_4/attention/self/query/kernel/adam_v ['electra', 'encoder', 'layer_4', 'attention', 'self', 'query', 'kernel', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_4', 'attention', 'self', 'value', 'bias'] electra/encoder/layer_4/attention/self/value/bias\n",
      "Skipping electra/encoder/layer_4/attention/self/value/bias/adam_m ['electra', 'encoder', 'layer_4', 'attention', 'self', 'value', 'bias', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping electra/encoder/layer_4/attention/self/value/bias/adam_v ['electra', 'encoder', 'layer_4', 'attention', 'self', 'value', 'bias', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_4', 'attention', 'self', 'value', 'kernel'] electra/encoder/layer_4/attention/self/value/kernel\n",
      "Skipping electra/encoder/layer_4/attention/self/value/kernel/adam_m ['electra', 'encoder', 'layer_4', 'attention', 'self', 'value', 'kernel', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping electra/encoder/layer_4/attention/self/value/kernel/adam_v ['electra', 'encoder', 'layer_4', 'attention', 'self', 'value', 'kernel', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_4', 'intermediate', 'dense', 'bias'] electra/encoder/layer_4/intermediate/dense/bias\n",
      "Skipping electra/encoder/layer_4/intermediate/dense/bias/adam_m ['electra', 'encoder', 'layer_4', 'intermediate', 'dense', 'bias', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping electra/encoder/layer_4/intermediate/dense/bias/adam_v ['electra', 'encoder', 'layer_4', 'intermediate', 'dense', 'bias', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_4', 'intermediate', 'dense', 'kernel'] electra/encoder/layer_4/intermediate/dense/kernel\n",
      "Skipping electra/encoder/layer_4/intermediate/dense/kernel/adam_m ['electra', 'encoder', 'layer_4', 'intermediate', 'dense', 'kernel', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping electra/encoder/layer_4/intermediate/dense/kernel/adam_v ['electra', 'encoder', 'layer_4', 'intermediate', 'dense', 'kernel', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_4', 'output', 'LayerNorm', 'beta'] electra/encoder/layer_4/output/LayerNorm/beta\n",
      "Skipping electra/encoder/layer_4/output/LayerNorm/beta/adam_m ['electra', 'encoder', 'layer_4', 'output', 'LayerNorm', 'beta', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping electra/encoder/layer_4/output/LayerNorm/beta/adam_v ['electra', 'encoder', 'layer_4', 'output', 'LayerNorm', 'beta', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_4', 'output', 'LayerNorm', 'gamma'] electra/encoder/layer_4/output/LayerNorm/gamma\n",
      "Skipping electra/encoder/layer_4/output/LayerNorm/gamma/adam_m ['electra', 'encoder', 'layer_4', 'output', 'LayerNorm', 'gamma', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping electra/encoder/layer_4/output/LayerNorm/gamma/adam_v ['electra', 'encoder', 'layer_4', 'output', 'LayerNorm', 'gamma', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_4', 'output', 'dense', 'bias'] electra/encoder/layer_4/output/dense/bias\n",
      "Skipping electra/encoder/layer_4/output/dense/bias/adam_m ['electra', 'encoder', 'layer_4', 'output', 'dense', 'bias', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping electra/encoder/layer_4/output/dense/bias/adam_v ['electra', 'encoder', 'layer_4', 'output', 'dense', 'bias', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_4', 'output', 'dense', 'kernel'] electra/encoder/layer_4/output/dense/kernel\n",
      "Skipping electra/encoder/layer_4/output/dense/kernel/adam_m ['electra', 'encoder', 'layer_4', 'output', 'dense', 'kernel', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping electra/encoder/layer_4/output/dense/kernel/adam_v ['electra', 'encoder', 'layer_4', 'output', 'dense', 'kernel', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_5', 'attention', 'output', 'LayerNorm', 'beta'] electra/encoder/layer_5/attention/output/LayerNorm/beta\n",
      "Skipping electra/encoder/layer_5/attention/output/LayerNorm/beta/adam_m ['electra', 'encoder', 'layer_5', 'attention', 'output', 'LayerNorm', 'beta', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping electra/encoder/layer_5/attention/output/LayerNorm/beta/adam_v ['electra', 'encoder', 'layer_5', 'attention', 'output', 'LayerNorm', 'beta', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_5', 'attention', 'output', 'LayerNorm', 'gamma'] electra/encoder/layer_5/attention/output/LayerNorm/gamma\n",
      "Skipping electra/encoder/layer_5/attention/output/LayerNorm/gamma/adam_m ['electra', 'encoder', 'layer_5', 'attention', 'output', 'LayerNorm', 'gamma', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping electra/encoder/layer_5/attention/output/LayerNorm/gamma/adam_v ['electra', 'encoder', 'layer_5', 'attention', 'output', 'LayerNorm', 'gamma', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_5', 'attention', 'output', 'dense', 'bias'] electra/encoder/layer_5/attention/output/dense/bias\n",
      "Skipping electra/encoder/layer_5/attention/output/dense/bias/adam_m ['electra', 'encoder', 'layer_5', 'attention', 'output', 'dense', 'bias', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping electra/encoder/layer_5/attention/output/dense/bias/adam_v ['electra', 'encoder', 'layer_5', 'attention', 'output', 'dense', 'bias', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_5', 'attention', 'output', 'dense', 'kernel'] electra/encoder/layer_5/attention/output/dense/kernel\n",
      "Skipping electra/encoder/layer_5/attention/output/dense/kernel/adam_m ['electra', 'encoder', 'layer_5', 'attention', 'output', 'dense', 'kernel', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping electra/encoder/layer_5/attention/output/dense/kernel/adam_v ['electra', 'encoder', 'layer_5', 'attention', 'output', 'dense', 'kernel', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_5', 'attention', 'self', 'key', 'bias'] electra/encoder/layer_5/attention/self/key/bias\n",
      "Skipping electra/encoder/layer_5/attention/self/key/bias/adam_m ['electra', 'encoder', 'layer_5', 'attention', 'self', 'key', 'bias', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping electra/encoder/layer_5/attention/self/key/bias/adam_v ['electra', 'encoder', 'layer_5', 'attention', 'self', 'key', 'bias', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_5', 'attention', 'self', 'key', 'kernel'] electra/encoder/layer_5/attention/self/key/kernel\n",
      "Skipping electra/encoder/layer_5/attention/self/key/kernel/adam_m ['electra', 'encoder', 'layer_5', 'attention', 'self', 'key', 'kernel', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping electra/encoder/layer_5/attention/self/key/kernel/adam_v ['electra', 'encoder', 'layer_5', 'attention', 'self', 'key', 'kernel', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_5', 'attention', 'self', 'query', 'bias'] electra/encoder/layer_5/attention/self/query/bias\n",
      "Skipping electra/encoder/layer_5/attention/self/query/bias/adam_m ['electra', 'encoder', 'layer_5', 'attention', 'self', 'query', 'bias', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping electra/encoder/layer_5/attention/self/query/bias/adam_v ['electra', 'encoder', 'layer_5', 'attention', 'self', 'query', 'bias', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_5', 'attention', 'self', 'query', 'kernel'] electra/encoder/layer_5/attention/self/query/kernel\n",
      "Skipping electra/encoder/layer_5/attention/self/query/kernel/adam_m ['electra', 'encoder', 'layer_5', 'attention', 'self', 'query', 'kernel', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping electra/encoder/layer_5/attention/self/query/kernel/adam_v ['electra', 'encoder', 'layer_5', 'attention', 'self', 'query', 'kernel', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_5', 'attention', 'self', 'value', 'bias'] electra/encoder/layer_5/attention/self/value/bias\n",
      "Skipping electra/encoder/layer_5/attention/self/value/bias/adam_m ['electra', 'encoder', 'layer_5', 'attention', 'self', 'value', 'bias', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping electra/encoder/layer_5/attention/self/value/bias/adam_v ['electra', 'encoder', 'layer_5', 'attention', 'self', 'value', 'bias', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_5', 'attention', 'self', 'value', 'kernel'] electra/encoder/layer_5/attention/self/value/kernel\n",
      "Skipping electra/encoder/layer_5/attention/self/value/kernel/adam_m ['electra', 'encoder', 'layer_5', 'attention', 'self', 'value', 'kernel', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping electra/encoder/layer_5/attention/self/value/kernel/adam_v ['electra', 'encoder', 'layer_5', 'attention', 'self', 'value', 'kernel', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_5', 'intermediate', 'dense', 'bias'] electra/encoder/layer_5/intermediate/dense/bias\n",
      "Skipping electra/encoder/layer_5/intermediate/dense/bias/adam_m ['electra', 'encoder', 'layer_5', 'intermediate', 'dense', 'bias', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping electra/encoder/layer_5/intermediate/dense/bias/adam_v ['electra', 'encoder', 'layer_5', 'intermediate', 'dense', 'bias', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_5', 'intermediate', 'dense', 'kernel'] electra/encoder/layer_5/intermediate/dense/kernel\n",
      "Skipping electra/encoder/layer_5/intermediate/dense/kernel/adam_m ['electra', 'encoder', 'layer_5', 'intermediate', 'dense', 'kernel', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping electra/encoder/layer_5/intermediate/dense/kernel/adam_v ['electra', 'encoder', 'layer_5', 'intermediate', 'dense', 'kernel', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_5', 'output', 'LayerNorm', 'beta'] electra/encoder/layer_5/output/LayerNorm/beta\n",
      "Skipping electra/encoder/layer_5/output/LayerNorm/beta/adam_m ['electra', 'encoder', 'layer_5', 'output', 'LayerNorm', 'beta', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping electra/encoder/layer_5/output/LayerNorm/beta/adam_v ['electra', 'encoder', 'layer_5', 'output', 'LayerNorm', 'beta', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_5', 'output', 'LayerNorm', 'gamma'] electra/encoder/layer_5/output/LayerNorm/gamma\n",
      "Skipping electra/encoder/layer_5/output/LayerNorm/gamma/adam_m ['electra', 'encoder', 'layer_5', 'output', 'LayerNorm', 'gamma', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping electra/encoder/layer_5/output/LayerNorm/gamma/adam_v ['electra', 'encoder', 'layer_5', 'output', 'LayerNorm', 'gamma', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_5', 'output', 'dense', 'bias'] electra/encoder/layer_5/output/dense/bias\n",
      "Skipping electra/encoder/layer_5/output/dense/bias/adam_m ['electra', 'encoder', 'layer_5', 'output', 'dense', 'bias', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping electra/encoder/layer_5/output/dense/bias/adam_v ['electra', 'encoder', 'layer_5', 'output', 'dense', 'bias', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_5', 'output', 'dense', 'kernel'] electra/encoder/layer_5/output/dense/kernel\n",
      "Skipping electra/encoder/layer_5/output/dense/kernel/adam_m ['electra', 'encoder', 'layer_5', 'output', 'dense', 'kernel', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping electra/encoder/layer_5/output/dense/kernel/adam_v ['electra', 'encoder', 'layer_5', 'output', 'dense', 'kernel', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_6', 'attention', 'output', 'LayerNorm', 'beta'] electra/encoder/layer_6/attention/output/LayerNorm/beta\n",
      "Skipping electra/encoder/layer_6/attention/output/LayerNorm/beta/adam_m ['electra', 'encoder', 'layer_6', 'attention', 'output', 'LayerNorm', 'beta', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping electra/encoder/layer_6/attention/output/LayerNorm/beta/adam_v ['electra', 'encoder', 'layer_6', 'attention', 'output', 'LayerNorm', 'beta', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_6', 'attention', 'output', 'LayerNorm', 'gamma'] electra/encoder/layer_6/attention/output/LayerNorm/gamma\n",
      "Skipping electra/encoder/layer_6/attention/output/LayerNorm/gamma/adam_m ['electra', 'encoder', 'layer_6', 'attention', 'output', 'LayerNorm', 'gamma', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping electra/encoder/layer_6/attention/output/LayerNorm/gamma/adam_v ['electra', 'encoder', 'layer_6', 'attention', 'output', 'LayerNorm', 'gamma', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_6', 'attention', 'output', 'dense', 'bias'] electra/encoder/layer_6/attention/output/dense/bias\n",
      "Skipping electra/encoder/layer_6/attention/output/dense/bias/adam_m ['electra', 'encoder', 'layer_6', 'attention', 'output', 'dense', 'bias', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping electra/encoder/layer_6/attention/output/dense/bias/adam_v ['electra', 'encoder', 'layer_6', 'attention', 'output', 'dense', 'bias', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_6', 'attention', 'output', 'dense', 'kernel'] electra/encoder/layer_6/attention/output/dense/kernel\n",
      "Skipping electra/encoder/layer_6/attention/output/dense/kernel/adam_m ['electra', 'encoder', 'layer_6', 'attention', 'output', 'dense', 'kernel', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping electra/encoder/layer_6/attention/output/dense/kernel/adam_v ['electra', 'encoder', 'layer_6', 'attention', 'output', 'dense', 'kernel', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_6', 'attention', 'self', 'key', 'bias'] electra/encoder/layer_6/attention/self/key/bias\n",
      "Skipping electra/encoder/layer_6/attention/self/key/bias/adam_m ['electra', 'encoder', 'layer_6', 'attention', 'self', 'key', 'bias', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping electra/encoder/layer_6/attention/self/key/bias/adam_v ['electra', 'encoder', 'layer_6', 'attention', 'self', 'key', 'bias', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_6', 'attention', 'self', 'key', 'kernel'] electra/encoder/layer_6/attention/self/key/kernel\n",
      "Skipping electra/encoder/layer_6/attention/self/key/kernel/adam_m ['electra', 'encoder', 'layer_6', 'attention', 'self', 'key', 'kernel', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping electra/encoder/layer_6/attention/self/key/kernel/adam_v ['electra', 'encoder', 'layer_6', 'attention', 'self', 'key', 'kernel', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_6', 'attention', 'self', 'query', 'bias'] electra/encoder/layer_6/attention/self/query/bias\n",
      "Skipping electra/encoder/layer_6/attention/self/query/bias/adam_m ['electra', 'encoder', 'layer_6', 'attention', 'self', 'query', 'bias', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping electra/encoder/layer_6/attention/self/query/bias/adam_v ['electra', 'encoder', 'layer_6', 'attention', 'self', 'query', 'bias', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_6', 'attention', 'self', 'query', 'kernel'] electra/encoder/layer_6/attention/self/query/kernel\n",
      "Skipping electra/encoder/layer_6/attention/self/query/kernel/adam_m ['electra', 'encoder', 'layer_6', 'attention', 'self', 'query', 'kernel', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping electra/encoder/layer_6/attention/self/query/kernel/adam_v ['electra', 'encoder', 'layer_6', 'attention', 'self', 'query', 'kernel', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_6', 'attention', 'self', 'value', 'bias'] electra/encoder/layer_6/attention/self/value/bias\n",
      "Skipping electra/encoder/layer_6/attention/self/value/bias/adam_m ['electra', 'encoder', 'layer_6', 'attention', 'self', 'value', 'bias', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping electra/encoder/layer_6/attention/self/value/bias/adam_v ['electra', 'encoder', 'layer_6', 'attention', 'self', 'value', 'bias', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_6', 'attention', 'self', 'value', 'kernel'] electra/encoder/layer_6/attention/self/value/kernel\n",
      "Skipping electra/encoder/layer_6/attention/self/value/kernel/adam_m ['electra', 'encoder', 'layer_6', 'attention', 'self', 'value', 'kernel', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping electra/encoder/layer_6/attention/self/value/kernel/adam_v ['electra', 'encoder', 'layer_6', 'attention', 'self', 'value', 'kernel', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_6', 'intermediate', 'dense', 'bias'] electra/encoder/layer_6/intermediate/dense/bias\n",
      "Skipping electra/encoder/layer_6/intermediate/dense/bias/adam_m ['electra', 'encoder', 'layer_6', 'intermediate', 'dense', 'bias', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping electra/encoder/layer_6/intermediate/dense/bias/adam_v ['electra', 'encoder', 'layer_6', 'intermediate', 'dense', 'bias', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_6', 'intermediate', 'dense', 'kernel'] electra/encoder/layer_6/intermediate/dense/kernel\n",
      "Skipping electra/encoder/layer_6/intermediate/dense/kernel/adam_m ['electra', 'encoder', 'layer_6', 'intermediate', 'dense', 'kernel', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping electra/encoder/layer_6/intermediate/dense/kernel/adam_v ['electra', 'encoder', 'layer_6', 'intermediate', 'dense', 'kernel', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_6', 'output', 'LayerNorm', 'beta'] electra/encoder/layer_6/output/LayerNorm/beta\n",
      "Skipping electra/encoder/layer_6/output/LayerNorm/beta/adam_m ['electra', 'encoder', 'layer_6', 'output', 'LayerNorm', 'beta', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping electra/encoder/layer_6/output/LayerNorm/beta/adam_v ['electra', 'encoder', 'layer_6', 'output', 'LayerNorm', 'beta', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_6', 'output', 'LayerNorm', 'gamma'] electra/encoder/layer_6/output/LayerNorm/gamma\n",
      "Skipping electra/encoder/layer_6/output/LayerNorm/gamma/adam_m ['electra', 'encoder', 'layer_6', 'output', 'LayerNorm', 'gamma', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping electra/encoder/layer_6/output/LayerNorm/gamma/adam_v ['electra', 'encoder', 'layer_6', 'output', 'LayerNorm', 'gamma', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_6', 'output', 'dense', 'bias'] electra/encoder/layer_6/output/dense/bias\n",
      "Skipping electra/encoder/layer_6/output/dense/bias/adam_m ['electra', 'encoder', 'layer_6', 'output', 'dense', 'bias', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping electra/encoder/layer_6/output/dense/bias/adam_v ['electra', 'encoder', 'layer_6', 'output', 'dense', 'bias', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_6', 'output', 'dense', 'kernel'] electra/encoder/layer_6/output/dense/kernel\n",
      "Skipping electra/encoder/layer_6/output/dense/kernel/adam_m ['electra', 'encoder', 'layer_6', 'output', 'dense', 'kernel', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping electra/encoder/layer_6/output/dense/kernel/adam_v ['electra', 'encoder', 'layer_6', 'output', 'dense', 'kernel', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_7', 'attention', 'output', 'LayerNorm', 'beta'] electra/encoder/layer_7/attention/output/LayerNorm/beta\n",
      "Skipping electra/encoder/layer_7/attention/output/LayerNorm/beta/adam_m ['electra', 'encoder', 'layer_7', 'attention', 'output', 'LayerNorm', 'beta', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping electra/encoder/layer_7/attention/output/LayerNorm/beta/adam_v ['electra', 'encoder', 'layer_7', 'attention', 'output', 'LayerNorm', 'beta', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_7', 'attention', 'output', 'LayerNorm', 'gamma'] electra/encoder/layer_7/attention/output/LayerNorm/gamma\n",
      "Skipping electra/encoder/layer_7/attention/output/LayerNorm/gamma/adam_m ['electra', 'encoder', 'layer_7', 'attention', 'output', 'LayerNorm', 'gamma', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping electra/encoder/layer_7/attention/output/LayerNorm/gamma/adam_v ['electra', 'encoder', 'layer_7', 'attention', 'output', 'LayerNorm', 'gamma', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_7', 'attention', 'output', 'dense', 'bias'] electra/encoder/layer_7/attention/output/dense/bias\n",
      "Skipping electra/encoder/layer_7/attention/output/dense/bias/adam_m ['electra', 'encoder', 'layer_7', 'attention', 'output', 'dense', 'bias', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping electra/encoder/layer_7/attention/output/dense/bias/adam_v ['electra', 'encoder', 'layer_7', 'attention', 'output', 'dense', 'bias', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_7', 'attention', 'output', 'dense', 'kernel'] electra/encoder/layer_7/attention/output/dense/kernel\n",
      "Skipping electra/encoder/layer_7/attention/output/dense/kernel/adam_m ['electra', 'encoder', 'layer_7', 'attention', 'output', 'dense', 'kernel', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping electra/encoder/layer_7/attention/output/dense/kernel/adam_v ['electra', 'encoder', 'layer_7', 'attention', 'output', 'dense', 'kernel', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_7', 'attention', 'self', 'key', 'bias'] electra/encoder/layer_7/attention/self/key/bias\n",
      "Skipping electra/encoder/layer_7/attention/self/key/bias/adam_m ['electra', 'encoder', 'layer_7', 'attention', 'self', 'key', 'bias', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping electra/encoder/layer_7/attention/self/key/bias/adam_v ['electra', 'encoder', 'layer_7', 'attention', 'self', 'key', 'bias', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_7', 'attention', 'self', 'key', 'kernel'] electra/encoder/layer_7/attention/self/key/kernel\n",
      "Skipping electra/encoder/layer_7/attention/self/key/kernel/adam_m ['electra', 'encoder', 'layer_7', 'attention', 'self', 'key', 'kernel', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping electra/encoder/layer_7/attention/self/key/kernel/adam_v ['electra', 'encoder', 'layer_7', 'attention', 'self', 'key', 'kernel', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_7', 'attention', 'self', 'query', 'bias'] electra/encoder/layer_7/attention/self/query/bias\n",
      "Skipping electra/encoder/layer_7/attention/self/query/bias/adam_m ['electra', 'encoder', 'layer_7', 'attention', 'self', 'query', 'bias', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping electra/encoder/layer_7/attention/self/query/bias/adam_v ['electra', 'encoder', 'layer_7', 'attention', 'self', 'query', 'bias', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_7', 'attention', 'self', 'query', 'kernel'] electra/encoder/layer_7/attention/self/query/kernel\n",
      "Skipping electra/encoder/layer_7/attention/self/query/kernel/adam_m ['electra', 'encoder', 'layer_7', 'attention', 'self', 'query', 'kernel', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping electra/encoder/layer_7/attention/self/query/kernel/adam_v ['electra', 'encoder', 'layer_7', 'attention', 'self', 'query', 'kernel', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_7', 'attention', 'self', 'value', 'bias'] electra/encoder/layer_7/attention/self/value/bias\n",
      "Skipping electra/encoder/layer_7/attention/self/value/bias/adam_m ['electra', 'encoder', 'layer_7', 'attention', 'self', 'value', 'bias', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping electra/encoder/layer_7/attention/self/value/bias/adam_v ['electra', 'encoder', 'layer_7', 'attention', 'self', 'value', 'bias', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_7', 'attention', 'self', 'value', 'kernel'] electra/encoder/layer_7/attention/self/value/kernel\n",
      "Skipping electra/encoder/layer_7/attention/self/value/kernel/adam_m ['electra', 'encoder', 'layer_7', 'attention', 'self', 'value', 'kernel', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping electra/encoder/layer_7/attention/self/value/kernel/adam_v ['electra', 'encoder', 'layer_7', 'attention', 'self', 'value', 'kernel', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_7', 'intermediate', 'dense', 'bias'] electra/encoder/layer_7/intermediate/dense/bias\n",
      "Skipping electra/encoder/layer_7/intermediate/dense/bias/adam_m ['electra', 'encoder', 'layer_7', 'intermediate', 'dense', 'bias', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping electra/encoder/layer_7/intermediate/dense/bias/adam_v ['electra', 'encoder', 'layer_7', 'intermediate', 'dense', 'bias', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_7', 'intermediate', 'dense', 'kernel'] electra/encoder/layer_7/intermediate/dense/kernel\n",
      "Skipping electra/encoder/layer_7/intermediate/dense/kernel/adam_m ['electra', 'encoder', 'layer_7', 'intermediate', 'dense', 'kernel', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping electra/encoder/layer_7/intermediate/dense/kernel/adam_v ['electra', 'encoder', 'layer_7', 'intermediate', 'dense', 'kernel', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_7', 'output', 'LayerNorm', 'beta'] electra/encoder/layer_7/output/LayerNorm/beta\n",
      "Skipping electra/encoder/layer_7/output/LayerNorm/beta/adam_m ['electra', 'encoder', 'layer_7', 'output', 'LayerNorm', 'beta', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping electra/encoder/layer_7/output/LayerNorm/beta/adam_v ['electra', 'encoder', 'layer_7', 'output', 'LayerNorm', 'beta', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_7', 'output', 'LayerNorm', 'gamma'] electra/encoder/layer_7/output/LayerNorm/gamma\n",
      "Skipping electra/encoder/layer_7/output/LayerNorm/gamma/adam_m ['electra', 'encoder', 'layer_7', 'output', 'LayerNorm', 'gamma', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping electra/encoder/layer_7/output/LayerNorm/gamma/adam_v ['electra', 'encoder', 'layer_7', 'output', 'LayerNorm', 'gamma', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_7', 'output', 'dense', 'bias'] electra/encoder/layer_7/output/dense/bias\n",
      "Skipping electra/encoder/layer_7/output/dense/bias/adam_m ['electra', 'encoder', 'layer_7', 'output', 'dense', 'bias', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping electra/encoder/layer_7/output/dense/bias/adam_v ['electra', 'encoder', 'layer_7', 'output', 'dense', 'bias', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_7', 'output', 'dense', 'kernel'] electra/encoder/layer_7/output/dense/kernel\n",
      "Skipping electra/encoder/layer_7/output/dense/kernel/adam_m ['electra', 'encoder', 'layer_7', 'output', 'dense', 'kernel', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping electra/encoder/layer_7/output/dense/kernel/adam_v ['electra', 'encoder', 'layer_7', 'output', 'dense', 'kernel', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_8', 'attention', 'output', 'LayerNorm', 'beta'] electra/encoder/layer_8/attention/output/LayerNorm/beta\n",
      "Skipping electra/encoder/layer_8/attention/output/LayerNorm/beta/adam_m ['electra', 'encoder', 'layer_8', 'attention', 'output', 'LayerNorm', 'beta', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping electra/encoder/layer_8/attention/output/LayerNorm/beta/adam_v ['electra', 'encoder', 'layer_8', 'attention', 'output', 'LayerNorm', 'beta', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_8', 'attention', 'output', 'LayerNorm', 'gamma'] electra/encoder/layer_8/attention/output/LayerNorm/gamma\n",
      "Skipping electra/encoder/layer_8/attention/output/LayerNorm/gamma/adam_m ['electra', 'encoder', 'layer_8', 'attention', 'output', 'LayerNorm', 'gamma', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping electra/encoder/layer_8/attention/output/LayerNorm/gamma/adam_v ['electra', 'encoder', 'layer_8', 'attention', 'output', 'LayerNorm', 'gamma', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_8', 'attention', 'output', 'dense', 'bias'] electra/encoder/layer_8/attention/output/dense/bias\n",
      "Skipping electra/encoder/layer_8/attention/output/dense/bias/adam_m ['electra', 'encoder', 'layer_8', 'attention', 'output', 'dense', 'bias', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping electra/encoder/layer_8/attention/output/dense/bias/adam_v ['electra', 'encoder', 'layer_8', 'attention', 'output', 'dense', 'bias', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_8', 'attention', 'output', 'dense', 'kernel'] electra/encoder/layer_8/attention/output/dense/kernel\n",
      "Skipping electra/encoder/layer_8/attention/output/dense/kernel/adam_m ['electra', 'encoder', 'layer_8', 'attention', 'output', 'dense', 'kernel', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping electra/encoder/layer_8/attention/output/dense/kernel/adam_v ['electra', 'encoder', 'layer_8', 'attention', 'output', 'dense', 'kernel', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_8', 'attention', 'self', 'key', 'bias'] electra/encoder/layer_8/attention/self/key/bias\n",
      "Skipping electra/encoder/layer_8/attention/self/key/bias/adam_m ['electra', 'encoder', 'layer_8', 'attention', 'self', 'key', 'bias', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping electra/encoder/layer_8/attention/self/key/bias/adam_v ['electra', 'encoder', 'layer_8', 'attention', 'self', 'key', 'bias', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_8', 'attention', 'self', 'key', 'kernel'] electra/encoder/layer_8/attention/self/key/kernel\n",
      "Skipping electra/encoder/layer_8/attention/self/key/kernel/adam_m ['electra', 'encoder', 'layer_8', 'attention', 'self', 'key', 'kernel', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping electra/encoder/layer_8/attention/self/key/kernel/adam_v ['electra', 'encoder', 'layer_8', 'attention', 'self', 'key', 'kernel', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_8', 'attention', 'self', 'query', 'bias'] electra/encoder/layer_8/attention/self/query/bias\n",
      "Skipping electra/encoder/layer_8/attention/self/query/bias/adam_m ['electra', 'encoder', 'layer_8', 'attention', 'self', 'query', 'bias', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping electra/encoder/layer_8/attention/self/query/bias/adam_v ['electra', 'encoder', 'layer_8', 'attention', 'self', 'query', 'bias', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_8', 'attention', 'self', 'query', 'kernel'] electra/encoder/layer_8/attention/self/query/kernel\n",
      "Skipping electra/encoder/layer_8/attention/self/query/kernel/adam_m ['electra', 'encoder', 'layer_8', 'attention', 'self', 'query', 'kernel', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping electra/encoder/layer_8/attention/self/query/kernel/adam_v ['electra', 'encoder', 'layer_8', 'attention', 'self', 'query', 'kernel', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_8', 'attention', 'self', 'value', 'bias'] electra/encoder/layer_8/attention/self/value/bias\n",
      "Skipping electra/encoder/layer_8/attention/self/value/bias/adam_m ['electra', 'encoder', 'layer_8', 'attention', 'self', 'value', 'bias', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping electra/encoder/layer_8/attention/self/value/bias/adam_v ['electra', 'encoder', 'layer_8', 'attention', 'self', 'value', 'bias', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_8', 'attention', 'self', 'value', 'kernel'] electra/encoder/layer_8/attention/self/value/kernel\n",
      "Skipping electra/encoder/layer_8/attention/self/value/kernel/adam_m ['electra', 'encoder', 'layer_8', 'attention', 'self', 'value', 'kernel', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping electra/encoder/layer_8/attention/self/value/kernel/adam_v ['electra', 'encoder', 'layer_8', 'attention', 'self', 'value', 'kernel', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_8', 'intermediate', 'dense', 'bias'] electra/encoder/layer_8/intermediate/dense/bias\n",
      "Skipping electra/encoder/layer_8/intermediate/dense/bias/adam_m ['electra', 'encoder', 'layer_8', 'intermediate', 'dense', 'bias', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping electra/encoder/layer_8/intermediate/dense/bias/adam_v ['electra', 'encoder', 'layer_8', 'intermediate', 'dense', 'bias', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_8', 'intermediate', 'dense', 'kernel'] electra/encoder/layer_8/intermediate/dense/kernel\n",
      "Skipping electra/encoder/layer_8/intermediate/dense/kernel/adam_m ['electra', 'encoder', 'layer_8', 'intermediate', 'dense', 'kernel', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping electra/encoder/layer_8/intermediate/dense/kernel/adam_v ['electra', 'encoder', 'layer_8', 'intermediate', 'dense', 'kernel', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_8', 'output', 'LayerNorm', 'beta'] electra/encoder/layer_8/output/LayerNorm/beta\n",
      "Skipping electra/encoder/layer_8/output/LayerNorm/beta/adam_m ['electra', 'encoder', 'layer_8', 'output', 'LayerNorm', 'beta', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping electra/encoder/layer_8/output/LayerNorm/beta/adam_v ['electra', 'encoder', 'layer_8', 'output', 'LayerNorm', 'beta', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_8', 'output', 'LayerNorm', 'gamma'] electra/encoder/layer_8/output/LayerNorm/gamma\n",
      "Skipping electra/encoder/layer_8/output/LayerNorm/gamma/adam_m ['electra', 'encoder', 'layer_8', 'output', 'LayerNorm', 'gamma', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping electra/encoder/layer_8/output/LayerNorm/gamma/adam_v ['electra', 'encoder', 'layer_8', 'output', 'LayerNorm', 'gamma', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_8', 'output', 'dense', 'bias'] electra/encoder/layer_8/output/dense/bias\n",
      "Skipping electra/encoder/layer_8/output/dense/bias/adam_m ['electra', 'encoder', 'layer_8', 'output', 'dense', 'bias', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping electra/encoder/layer_8/output/dense/bias/adam_v ['electra', 'encoder', 'layer_8', 'output', 'dense', 'bias', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_8', 'output', 'dense', 'kernel'] electra/encoder/layer_8/output/dense/kernel\n",
      "Skipping electra/encoder/layer_8/output/dense/kernel/adam_m ['electra', 'encoder', 'layer_8', 'output', 'dense', 'kernel', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping electra/encoder/layer_8/output/dense/kernel/adam_v ['electra', 'encoder', 'layer_8', 'output', 'dense', 'kernel', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_9', 'attention', 'output', 'LayerNorm', 'beta'] electra/encoder/layer_9/attention/output/LayerNorm/beta\n",
      "Skipping electra/encoder/layer_9/attention/output/LayerNorm/beta/adam_m ['electra', 'encoder', 'layer_9', 'attention', 'output', 'LayerNorm', 'beta', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping electra/encoder/layer_9/attention/output/LayerNorm/beta/adam_v ['electra', 'encoder', 'layer_9', 'attention', 'output', 'LayerNorm', 'beta', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_9', 'attention', 'output', 'LayerNorm', 'gamma'] electra/encoder/layer_9/attention/output/LayerNorm/gamma\n",
      "Skipping electra/encoder/layer_9/attention/output/LayerNorm/gamma/adam_m ['electra', 'encoder', 'layer_9', 'attention', 'output', 'LayerNorm', 'gamma', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping electra/encoder/layer_9/attention/output/LayerNorm/gamma/adam_v ['electra', 'encoder', 'layer_9', 'attention', 'output', 'LayerNorm', 'gamma', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_9', 'attention', 'output', 'dense', 'bias'] electra/encoder/layer_9/attention/output/dense/bias\n",
      "Skipping electra/encoder/layer_9/attention/output/dense/bias/adam_m ['electra', 'encoder', 'layer_9', 'attention', 'output', 'dense', 'bias', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping electra/encoder/layer_9/attention/output/dense/bias/adam_v ['electra', 'encoder', 'layer_9', 'attention', 'output', 'dense', 'bias', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_9', 'attention', 'output', 'dense', 'kernel'] electra/encoder/layer_9/attention/output/dense/kernel\n",
      "Skipping electra/encoder/layer_9/attention/output/dense/kernel/adam_m ['electra', 'encoder', 'layer_9', 'attention', 'output', 'dense', 'kernel', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping electra/encoder/layer_9/attention/output/dense/kernel/adam_v ['electra', 'encoder', 'layer_9', 'attention', 'output', 'dense', 'kernel', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_9', 'attention', 'self', 'key', 'bias'] electra/encoder/layer_9/attention/self/key/bias\n",
      "Skipping electra/encoder/layer_9/attention/self/key/bias/adam_m ['electra', 'encoder', 'layer_9', 'attention', 'self', 'key', 'bias', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping electra/encoder/layer_9/attention/self/key/bias/adam_v ['electra', 'encoder', 'layer_9', 'attention', 'self', 'key', 'bias', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_9', 'attention', 'self', 'key', 'kernel'] electra/encoder/layer_9/attention/self/key/kernel\n",
      "Skipping electra/encoder/layer_9/attention/self/key/kernel/adam_m ['electra', 'encoder', 'layer_9', 'attention', 'self', 'key', 'kernel', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping electra/encoder/layer_9/attention/self/key/kernel/adam_v ['electra', 'encoder', 'layer_9', 'attention', 'self', 'key', 'kernel', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_9', 'attention', 'self', 'query', 'bias'] electra/encoder/layer_9/attention/self/query/bias\n",
      "Skipping electra/encoder/layer_9/attention/self/query/bias/adam_m ['electra', 'encoder', 'layer_9', 'attention', 'self', 'query', 'bias', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping electra/encoder/layer_9/attention/self/query/bias/adam_v ['electra', 'encoder', 'layer_9', 'attention', 'self', 'query', 'bias', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_9', 'attention', 'self', 'query', 'kernel'] electra/encoder/layer_9/attention/self/query/kernel\n",
      "Skipping electra/encoder/layer_9/attention/self/query/kernel/adam_m ['electra', 'encoder', 'layer_9', 'attention', 'self', 'query', 'kernel', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping electra/encoder/layer_9/attention/self/query/kernel/adam_v ['electra', 'encoder', 'layer_9', 'attention', 'self', 'query', 'kernel', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_9', 'attention', 'self', 'value', 'bias'] electra/encoder/layer_9/attention/self/value/bias\n",
      "Skipping electra/encoder/layer_9/attention/self/value/bias/adam_m ['electra', 'encoder', 'layer_9', 'attention', 'self', 'value', 'bias', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping electra/encoder/layer_9/attention/self/value/bias/adam_v ['electra', 'encoder', 'layer_9', 'attention', 'self', 'value', 'bias', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_9', 'attention', 'self', 'value', 'kernel'] electra/encoder/layer_9/attention/self/value/kernel\n",
      "Skipping electra/encoder/layer_9/attention/self/value/kernel/adam_m ['electra', 'encoder', 'layer_9', 'attention', 'self', 'value', 'kernel', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping electra/encoder/layer_9/attention/self/value/kernel/adam_v ['electra', 'encoder', 'layer_9', 'attention', 'self', 'value', 'kernel', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_9', 'intermediate', 'dense', 'bias'] electra/encoder/layer_9/intermediate/dense/bias\n",
      "Skipping electra/encoder/layer_9/intermediate/dense/bias/adam_m ['electra', 'encoder', 'layer_9', 'intermediate', 'dense', 'bias', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping electra/encoder/layer_9/intermediate/dense/bias/adam_v ['electra', 'encoder', 'layer_9', 'intermediate', 'dense', 'bias', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_9', 'intermediate', 'dense', 'kernel'] electra/encoder/layer_9/intermediate/dense/kernel\n",
      "Skipping electra/encoder/layer_9/intermediate/dense/kernel/adam_m ['electra', 'encoder', 'layer_9', 'intermediate', 'dense', 'kernel', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping electra/encoder/layer_9/intermediate/dense/kernel/adam_v ['electra', 'encoder', 'layer_9', 'intermediate', 'dense', 'kernel', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_9', 'output', 'LayerNorm', 'beta'] electra/encoder/layer_9/output/LayerNorm/beta\n",
      "Skipping electra/encoder/layer_9/output/LayerNorm/beta/adam_m ['electra', 'encoder', 'layer_9', 'output', 'LayerNorm', 'beta', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping electra/encoder/layer_9/output/LayerNorm/beta/adam_v ['electra', 'encoder', 'layer_9', 'output', 'LayerNorm', 'beta', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_9', 'output', 'LayerNorm', 'gamma'] electra/encoder/layer_9/output/LayerNorm/gamma\n",
      "Skipping electra/encoder/layer_9/output/LayerNorm/gamma/adam_m ['electra', 'encoder', 'layer_9', 'output', 'LayerNorm', 'gamma', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping electra/encoder/layer_9/output/LayerNorm/gamma/adam_v ['electra', 'encoder', 'layer_9', 'output', 'LayerNorm', 'gamma', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_9', 'output', 'dense', 'bias'] electra/encoder/layer_9/output/dense/bias\n",
      "Skipping electra/encoder/layer_9/output/dense/bias/adam_m ['electra', 'encoder', 'layer_9', 'output', 'dense', 'bias', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping electra/encoder/layer_9/output/dense/bias/adam_v ['electra', 'encoder', 'layer_9', 'output', 'dense', 'bias', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_9', 'output', 'dense', 'kernel'] electra/encoder/layer_9/output/dense/kernel\n",
      "Skipping electra/encoder/layer_9/output/dense/kernel/adam_m ['electra', 'encoder', 'layer_9', 'output', 'dense', 'kernel', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping electra/encoder/layer_9/output/dense/kernel/adam_v ['electra', 'encoder', 'layer_9', 'output', 'dense', 'kernel', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Skipping generator/embeddings_project/bias ['generator', 'embeddings_project', 'bias'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/embeddings_project/bias/adam_m ['generator', 'embeddings_project', 'bias', 'adam_m'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/embeddings_project/bias/adam_v ['generator', 'embeddings_project', 'bias', 'adam_v'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/embeddings_project/kernel ['generator', 'embeddings_project', 'kernel'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/embeddings_project/kernel/adam_m ['generator', 'embeddings_project', 'kernel', 'adam_m'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/embeddings_project/kernel/adam_v ['generator', 'embeddings_project', 'kernel', 'adam_v'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_0/attention/output/LayerNorm/beta ['generator', 'encoder', 'layer_0', 'attention', 'output', 'LayerNorm', 'beta'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_0/attention/output/LayerNorm/beta/adam_m ['generator', 'encoder', 'layer_0', 'attention', 'output', 'LayerNorm', 'beta', 'adam_m'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_0/attention/output/LayerNorm/beta/adam_v ['generator', 'encoder', 'layer_0', 'attention', 'output', 'LayerNorm', 'beta', 'adam_v'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_0/attention/output/LayerNorm/gamma ['generator', 'encoder', 'layer_0', 'attention', 'output', 'LayerNorm', 'gamma'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_0/attention/output/LayerNorm/gamma/adam_m ['generator', 'encoder', 'layer_0', 'attention', 'output', 'LayerNorm', 'gamma', 'adam_m'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_0/attention/output/LayerNorm/gamma/adam_v ['generator', 'encoder', 'layer_0', 'attention', 'output', 'LayerNorm', 'gamma', 'adam_v'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_0/attention/output/dense/bias ['generator', 'encoder', 'layer_0', 'attention', 'output', 'dense', 'bias'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_0/attention/output/dense/bias/adam_m ['generator', 'encoder', 'layer_0', 'attention', 'output', 'dense', 'bias', 'adam_m'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_0/attention/output/dense/bias/adam_v ['generator', 'encoder', 'layer_0', 'attention', 'output', 'dense', 'bias', 'adam_v'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_0/attention/output/dense/kernel ['generator', 'encoder', 'layer_0', 'attention', 'output', 'dense', 'kernel'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_0/attention/output/dense/kernel/adam_m ['generator', 'encoder', 'layer_0', 'attention', 'output', 'dense', 'kernel', 'adam_m'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_0/attention/output/dense/kernel/adam_v ['generator', 'encoder', 'layer_0', 'attention', 'output', 'dense', 'kernel', 'adam_v'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_0/attention/self/key/bias ['generator', 'encoder', 'layer_0', 'attention', 'self', 'key', 'bias'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_0/attention/self/key/bias/adam_m ['generator', 'encoder', 'layer_0', 'attention', 'self', 'key', 'bias', 'adam_m'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_0/attention/self/key/bias/adam_v ['generator', 'encoder', 'layer_0', 'attention', 'self', 'key', 'bias', 'adam_v'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_0/attention/self/key/kernel ['generator', 'encoder', 'layer_0', 'attention', 'self', 'key', 'kernel'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_0/attention/self/key/kernel/adam_m ['generator', 'encoder', 'layer_0', 'attention', 'self', 'key', 'kernel', 'adam_m'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_0/attention/self/key/kernel/adam_v ['generator', 'encoder', 'layer_0', 'attention', 'self', 'key', 'kernel', 'adam_v'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_0/attention/self/query/bias ['generator', 'encoder', 'layer_0', 'attention', 'self', 'query', 'bias'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_0/attention/self/query/bias/adam_m ['generator', 'encoder', 'layer_0', 'attention', 'self', 'query', 'bias', 'adam_m'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_0/attention/self/query/bias/adam_v ['generator', 'encoder', 'layer_0', 'attention', 'self', 'query', 'bias', 'adam_v'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_0/attention/self/query/kernel ['generator', 'encoder', 'layer_0', 'attention', 'self', 'query', 'kernel'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_0/attention/self/query/kernel/adam_m ['generator', 'encoder', 'layer_0', 'attention', 'self', 'query', 'kernel', 'adam_m'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_0/attention/self/query/kernel/adam_v ['generator', 'encoder', 'layer_0', 'attention', 'self', 'query', 'kernel', 'adam_v'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_0/attention/self/value/bias ['generator', 'encoder', 'layer_0', 'attention', 'self', 'value', 'bias'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_0/attention/self/value/bias/adam_m ['generator', 'encoder', 'layer_0', 'attention', 'self', 'value', 'bias', 'adam_m'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_0/attention/self/value/bias/adam_v ['generator', 'encoder', 'layer_0', 'attention', 'self', 'value', 'bias', 'adam_v'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_0/attention/self/value/kernel ['generator', 'encoder', 'layer_0', 'attention', 'self', 'value', 'kernel'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_0/attention/self/value/kernel/adam_m ['generator', 'encoder', 'layer_0', 'attention', 'self', 'value', 'kernel', 'adam_m'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_0/attention/self/value/kernel/adam_v ['generator', 'encoder', 'layer_0', 'attention', 'self', 'value', 'kernel', 'adam_v'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_0/intermediate/dense/bias ['generator', 'encoder', 'layer_0', 'intermediate', 'dense', 'bias'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_0/intermediate/dense/bias/adam_m ['generator', 'encoder', 'layer_0', 'intermediate', 'dense', 'bias', 'adam_m'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_0/intermediate/dense/bias/adam_v ['generator', 'encoder', 'layer_0', 'intermediate', 'dense', 'bias', 'adam_v'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_0/intermediate/dense/kernel ['generator', 'encoder', 'layer_0', 'intermediate', 'dense', 'kernel'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_0/intermediate/dense/kernel/adam_m ['generator', 'encoder', 'layer_0', 'intermediate', 'dense', 'kernel', 'adam_m'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_0/intermediate/dense/kernel/adam_v ['generator', 'encoder', 'layer_0', 'intermediate', 'dense', 'kernel', 'adam_v'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_0/output/LayerNorm/beta ['generator', 'encoder', 'layer_0', 'output', 'LayerNorm', 'beta'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_0/output/LayerNorm/beta/adam_m ['generator', 'encoder', 'layer_0', 'output', 'LayerNorm', 'beta', 'adam_m'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_0/output/LayerNorm/beta/adam_v ['generator', 'encoder', 'layer_0', 'output', 'LayerNorm', 'beta', 'adam_v'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_0/output/LayerNorm/gamma ['generator', 'encoder', 'layer_0', 'output', 'LayerNorm', 'gamma'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_0/output/LayerNorm/gamma/adam_m ['generator', 'encoder', 'layer_0', 'output', 'LayerNorm', 'gamma', 'adam_m'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_0/output/LayerNorm/gamma/adam_v ['generator', 'encoder', 'layer_0', 'output', 'LayerNorm', 'gamma', 'adam_v'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_0/output/dense/bias ['generator', 'encoder', 'layer_0', 'output', 'dense', 'bias'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_0/output/dense/bias/adam_m ['generator', 'encoder', 'layer_0', 'output', 'dense', 'bias', 'adam_m'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_0/output/dense/bias/adam_v ['generator', 'encoder', 'layer_0', 'output', 'dense', 'bias', 'adam_v'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_0/output/dense/kernel ['generator', 'encoder', 'layer_0', 'output', 'dense', 'kernel'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_0/output/dense/kernel/adam_m ['generator', 'encoder', 'layer_0', 'output', 'dense', 'kernel', 'adam_m'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_0/output/dense/kernel/adam_v ['generator', 'encoder', 'layer_0', 'output', 'dense', 'kernel', 'adam_v'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_1/attention/output/LayerNorm/beta ['generator', 'encoder', 'layer_1', 'attention', 'output', 'LayerNorm', 'beta'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_1/attention/output/LayerNorm/beta/adam_m ['generator', 'encoder', 'layer_1', 'attention', 'output', 'LayerNorm', 'beta', 'adam_m'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_1/attention/output/LayerNorm/beta/adam_v ['generator', 'encoder', 'layer_1', 'attention', 'output', 'LayerNorm', 'beta', 'adam_v'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_1/attention/output/LayerNorm/gamma ['generator', 'encoder', 'layer_1', 'attention', 'output', 'LayerNorm', 'gamma'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_1/attention/output/LayerNorm/gamma/adam_m ['generator', 'encoder', 'layer_1', 'attention', 'output', 'LayerNorm', 'gamma', 'adam_m'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_1/attention/output/LayerNorm/gamma/adam_v ['generator', 'encoder', 'layer_1', 'attention', 'output', 'LayerNorm', 'gamma', 'adam_v'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_1/attention/output/dense/bias ['generator', 'encoder', 'layer_1', 'attention', 'output', 'dense', 'bias'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_1/attention/output/dense/bias/adam_m ['generator', 'encoder', 'layer_1', 'attention', 'output', 'dense', 'bias', 'adam_m'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_1/attention/output/dense/bias/adam_v ['generator', 'encoder', 'layer_1', 'attention', 'output', 'dense', 'bias', 'adam_v'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_1/attention/output/dense/kernel ['generator', 'encoder', 'layer_1', 'attention', 'output', 'dense', 'kernel'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_1/attention/output/dense/kernel/adam_m ['generator', 'encoder', 'layer_1', 'attention', 'output', 'dense', 'kernel', 'adam_m'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_1/attention/output/dense/kernel/adam_v ['generator', 'encoder', 'layer_1', 'attention', 'output', 'dense', 'kernel', 'adam_v'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_1/attention/self/key/bias ['generator', 'encoder', 'layer_1', 'attention', 'self', 'key', 'bias'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_1/attention/self/key/bias/adam_m ['generator', 'encoder', 'layer_1', 'attention', 'self', 'key', 'bias', 'adam_m'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_1/attention/self/key/bias/adam_v ['generator', 'encoder', 'layer_1', 'attention', 'self', 'key', 'bias', 'adam_v'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_1/attention/self/key/kernel ['generator', 'encoder', 'layer_1', 'attention', 'self', 'key', 'kernel'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_1/attention/self/key/kernel/adam_m ['generator', 'encoder', 'layer_1', 'attention', 'self', 'key', 'kernel', 'adam_m'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_1/attention/self/key/kernel/adam_v ['generator', 'encoder', 'layer_1', 'attention', 'self', 'key', 'kernel', 'adam_v'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_1/attention/self/query/bias ['generator', 'encoder', 'layer_1', 'attention', 'self', 'query', 'bias'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_1/attention/self/query/bias/adam_m ['generator', 'encoder', 'layer_1', 'attention', 'self', 'query', 'bias', 'adam_m'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_1/attention/self/query/bias/adam_v ['generator', 'encoder', 'layer_1', 'attention', 'self', 'query', 'bias', 'adam_v'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_1/attention/self/query/kernel ['generator', 'encoder', 'layer_1', 'attention', 'self', 'query', 'kernel'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_1/attention/self/query/kernel/adam_m ['generator', 'encoder', 'layer_1', 'attention', 'self', 'query', 'kernel', 'adam_m'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_1/attention/self/query/kernel/adam_v ['generator', 'encoder', 'layer_1', 'attention', 'self', 'query', 'kernel', 'adam_v'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_1/attention/self/value/bias ['generator', 'encoder', 'layer_1', 'attention', 'self', 'value', 'bias'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_1/attention/self/value/bias/adam_m ['generator', 'encoder', 'layer_1', 'attention', 'self', 'value', 'bias', 'adam_m'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_1/attention/self/value/bias/adam_v ['generator', 'encoder', 'layer_1', 'attention', 'self', 'value', 'bias', 'adam_v'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_1/attention/self/value/kernel ['generator', 'encoder', 'layer_1', 'attention', 'self', 'value', 'kernel'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_1/attention/self/value/kernel/adam_m ['generator', 'encoder', 'layer_1', 'attention', 'self', 'value', 'kernel', 'adam_m'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_1/attention/self/value/kernel/adam_v ['generator', 'encoder', 'layer_1', 'attention', 'self', 'value', 'kernel', 'adam_v'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_1/intermediate/dense/bias ['generator', 'encoder', 'layer_1', 'intermediate', 'dense', 'bias'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_1/intermediate/dense/bias/adam_m ['generator', 'encoder', 'layer_1', 'intermediate', 'dense', 'bias', 'adam_m'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_1/intermediate/dense/bias/adam_v ['generator', 'encoder', 'layer_1', 'intermediate', 'dense', 'bias', 'adam_v'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_1/intermediate/dense/kernel ['generator', 'encoder', 'layer_1', 'intermediate', 'dense', 'kernel'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_1/intermediate/dense/kernel/adam_m ['generator', 'encoder', 'layer_1', 'intermediate', 'dense', 'kernel', 'adam_m'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_1/intermediate/dense/kernel/adam_v ['generator', 'encoder', 'layer_1', 'intermediate', 'dense', 'kernel', 'adam_v'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_1/output/LayerNorm/beta ['generator', 'encoder', 'layer_1', 'output', 'LayerNorm', 'beta'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_1/output/LayerNorm/beta/adam_m ['generator', 'encoder', 'layer_1', 'output', 'LayerNorm', 'beta', 'adam_m'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_1/output/LayerNorm/beta/adam_v ['generator', 'encoder', 'layer_1', 'output', 'LayerNorm', 'beta', 'adam_v'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_1/output/LayerNorm/gamma ['generator', 'encoder', 'layer_1', 'output', 'LayerNorm', 'gamma'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_1/output/LayerNorm/gamma/adam_m ['generator', 'encoder', 'layer_1', 'output', 'LayerNorm', 'gamma', 'adam_m'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_1/output/LayerNorm/gamma/adam_v ['generator', 'encoder', 'layer_1', 'output', 'LayerNorm', 'gamma', 'adam_v'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_1/output/dense/bias ['generator', 'encoder', 'layer_1', 'output', 'dense', 'bias'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_1/output/dense/bias/adam_m ['generator', 'encoder', 'layer_1', 'output', 'dense', 'bias', 'adam_m'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_1/output/dense/bias/adam_v ['generator', 'encoder', 'layer_1', 'output', 'dense', 'bias', 'adam_v'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_1/output/dense/kernel ['generator', 'encoder', 'layer_1', 'output', 'dense', 'kernel'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_1/output/dense/kernel/adam_m ['generator', 'encoder', 'layer_1', 'output', 'dense', 'kernel', 'adam_m'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_1/output/dense/kernel/adam_v ['generator', 'encoder', 'layer_1', 'output', 'dense', 'kernel', 'adam_v'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_10/attention/output/LayerNorm/beta ['generator', 'encoder', 'layer_10', 'attention', 'output', 'LayerNorm', 'beta'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_10/attention/output/LayerNorm/beta/adam_m ['generator', 'encoder', 'layer_10', 'attention', 'output', 'LayerNorm', 'beta', 'adam_m'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_10/attention/output/LayerNorm/beta/adam_v ['generator', 'encoder', 'layer_10', 'attention', 'output', 'LayerNorm', 'beta', 'adam_v'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_10/attention/output/LayerNorm/gamma ['generator', 'encoder', 'layer_10', 'attention', 'output', 'LayerNorm', 'gamma'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_10/attention/output/LayerNorm/gamma/adam_m ['generator', 'encoder', 'layer_10', 'attention', 'output', 'LayerNorm', 'gamma', 'adam_m'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_10/attention/output/LayerNorm/gamma/adam_v ['generator', 'encoder', 'layer_10', 'attention', 'output', 'LayerNorm', 'gamma', 'adam_v'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_10/attention/output/dense/bias ['generator', 'encoder', 'layer_10', 'attention', 'output', 'dense', 'bias'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_10/attention/output/dense/bias/adam_m ['generator', 'encoder', 'layer_10', 'attention', 'output', 'dense', 'bias', 'adam_m'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_10/attention/output/dense/bias/adam_v ['generator', 'encoder', 'layer_10', 'attention', 'output', 'dense', 'bias', 'adam_v'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_10/attention/output/dense/kernel ['generator', 'encoder', 'layer_10', 'attention', 'output', 'dense', 'kernel'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_10/attention/output/dense/kernel/adam_m ['generator', 'encoder', 'layer_10', 'attention', 'output', 'dense', 'kernel', 'adam_m'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_10/attention/output/dense/kernel/adam_v ['generator', 'encoder', 'layer_10', 'attention', 'output', 'dense', 'kernel', 'adam_v'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_10/attention/self/key/bias ['generator', 'encoder', 'layer_10', 'attention', 'self', 'key', 'bias'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_10/attention/self/key/bias/adam_m ['generator', 'encoder', 'layer_10', 'attention', 'self', 'key', 'bias', 'adam_m'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_10/attention/self/key/bias/adam_v ['generator', 'encoder', 'layer_10', 'attention', 'self', 'key', 'bias', 'adam_v'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_10/attention/self/key/kernel ['generator', 'encoder', 'layer_10', 'attention', 'self', 'key', 'kernel'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_10/attention/self/key/kernel/adam_m ['generator', 'encoder', 'layer_10', 'attention', 'self', 'key', 'kernel', 'adam_m'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_10/attention/self/key/kernel/adam_v ['generator', 'encoder', 'layer_10', 'attention', 'self', 'key', 'kernel', 'adam_v'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_10/attention/self/query/bias ['generator', 'encoder', 'layer_10', 'attention', 'self', 'query', 'bias'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_10/attention/self/query/bias/adam_m ['generator', 'encoder', 'layer_10', 'attention', 'self', 'query', 'bias', 'adam_m'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_10/attention/self/query/bias/adam_v ['generator', 'encoder', 'layer_10', 'attention', 'self', 'query', 'bias', 'adam_v'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_10/attention/self/query/kernel ['generator', 'encoder', 'layer_10', 'attention', 'self', 'query', 'kernel'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_10/attention/self/query/kernel/adam_m ['generator', 'encoder', 'layer_10', 'attention', 'self', 'query', 'kernel', 'adam_m'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_10/attention/self/query/kernel/adam_v ['generator', 'encoder', 'layer_10', 'attention', 'self', 'query', 'kernel', 'adam_v'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_10/attention/self/value/bias ['generator', 'encoder', 'layer_10', 'attention', 'self', 'value', 'bias'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_10/attention/self/value/bias/adam_m ['generator', 'encoder', 'layer_10', 'attention', 'self', 'value', 'bias', 'adam_m'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_10/attention/self/value/bias/adam_v ['generator', 'encoder', 'layer_10', 'attention', 'self', 'value', 'bias', 'adam_v'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_10/attention/self/value/kernel ['generator', 'encoder', 'layer_10', 'attention', 'self', 'value', 'kernel'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_10/attention/self/value/kernel/adam_m ['generator', 'encoder', 'layer_10', 'attention', 'self', 'value', 'kernel', 'adam_m'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_10/attention/self/value/kernel/adam_v ['generator', 'encoder', 'layer_10', 'attention', 'self', 'value', 'kernel', 'adam_v'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_10/intermediate/dense/bias ['generator', 'encoder', 'layer_10', 'intermediate', 'dense', 'bias'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_10/intermediate/dense/bias/adam_m ['generator', 'encoder', 'layer_10', 'intermediate', 'dense', 'bias', 'adam_m'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_10/intermediate/dense/bias/adam_v ['generator', 'encoder', 'layer_10', 'intermediate', 'dense', 'bias', 'adam_v'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_10/intermediate/dense/kernel ['generator', 'encoder', 'layer_10', 'intermediate', 'dense', 'kernel'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_10/intermediate/dense/kernel/adam_m ['generator', 'encoder', 'layer_10', 'intermediate', 'dense', 'kernel', 'adam_m'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_10/intermediate/dense/kernel/adam_v ['generator', 'encoder', 'layer_10', 'intermediate', 'dense', 'kernel', 'adam_v'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_10/output/LayerNorm/beta ['generator', 'encoder', 'layer_10', 'output', 'LayerNorm', 'beta'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_10/output/LayerNorm/beta/adam_m ['generator', 'encoder', 'layer_10', 'output', 'LayerNorm', 'beta', 'adam_m'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_10/output/LayerNorm/beta/adam_v ['generator', 'encoder', 'layer_10', 'output', 'LayerNorm', 'beta', 'adam_v'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_10/output/LayerNorm/gamma ['generator', 'encoder', 'layer_10', 'output', 'LayerNorm', 'gamma'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_10/output/LayerNorm/gamma/adam_m ['generator', 'encoder', 'layer_10', 'output', 'LayerNorm', 'gamma', 'adam_m'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_10/output/LayerNorm/gamma/adam_v ['generator', 'encoder', 'layer_10', 'output', 'LayerNorm', 'gamma', 'adam_v'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_10/output/dense/bias ['generator', 'encoder', 'layer_10', 'output', 'dense', 'bias'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_10/output/dense/bias/adam_m ['generator', 'encoder', 'layer_10', 'output', 'dense', 'bias', 'adam_m'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_10/output/dense/bias/adam_v ['generator', 'encoder', 'layer_10', 'output', 'dense', 'bias', 'adam_v'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_10/output/dense/kernel ['generator', 'encoder', 'layer_10', 'output', 'dense', 'kernel'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_10/output/dense/kernel/adam_m ['generator', 'encoder', 'layer_10', 'output', 'dense', 'kernel', 'adam_m'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_10/output/dense/kernel/adam_v ['generator', 'encoder', 'layer_10', 'output', 'dense', 'kernel', 'adam_v'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_11/attention/output/LayerNorm/beta ['generator', 'encoder', 'layer_11', 'attention', 'output', 'LayerNorm', 'beta'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_11/attention/output/LayerNorm/beta/adam_m ['generator', 'encoder', 'layer_11', 'attention', 'output', 'LayerNorm', 'beta', 'adam_m'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_11/attention/output/LayerNorm/beta/adam_v ['generator', 'encoder', 'layer_11', 'attention', 'output', 'LayerNorm', 'beta', 'adam_v'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_11/attention/output/LayerNorm/gamma ['generator', 'encoder', 'layer_11', 'attention', 'output', 'LayerNorm', 'gamma'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_11/attention/output/LayerNorm/gamma/adam_m ['generator', 'encoder', 'layer_11', 'attention', 'output', 'LayerNorm', 'gamma', 'adam_m'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_11/attention/output/LayerNorm/gamma/adam_v ['generator', 'encoder', 'layer_11', 'attention', 'output', 'LayerNorm', 'gamma', 'adam_v'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_11/attention/output/dense/bias ['generator', 'encoder', 'layer_11', 'attention', 'output', 'dense', 'bias'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_11/attention/output/dense/bias/adam_m ['generator', 'encoder', 'layer_11', 'attention', 'output', 'dense', 'bias', 'adam_m'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_11/attention/output/dense/bias/adam_v ['generator', 'encoder', 'layer_11', 'attention', 'output', 'dense', 'bias', 'adam_v'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_11/attention/output/dense/kernel ['generator', 'encoder', 'layer_11', 'attention', 'output', 'dense', 'kernel'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_11/attention/output/dense/kernel/adam_m ['generator', 'encoder', 'layer_11', 'attention', 'output', 'dense', 'kernel', 'adam_m'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_11/attention/output/dense/kernel/adam_v ['generator', 'encoder', 'layer_11', 'attention', 'output', 'dense', 'kernel', 'adam_v'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_11/attention/self/key/bias ['generator', 'encoder', 'layer_11', 'attention', 'self', 'key', 'bias'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_11/attention/self/key/bias/adam_m ['generator', 'encoder', 'layer_11', 'attention', 'self', 'key', 'bias', 'adam_m'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_11/attention/self/key/bias/adam_v ['generator', 'encoder', 'layer_11', 'attention', 'self', 'key', 'bias', 'adam_v'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_11/attention/self/key/kernel ['generator', 'encoder', 'layer_11', 'attention', 'self', 'key', 'kernel'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_11/attention/self/key/kernel/adam_m ['generator', 'encoder', 'layer_11', 'attention', 'self', 'key', 'kernel', 'adam_m'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_11/attention/self/key/kernel/adam_v ['generator', 'encoder', 'layer_11', 'attention', 'self', 'key', 'kernel', 'adam_v'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_11/attention/self/query/bias ['generator', 'encoder', 'layer_11', 'attention', 'self', 'query', 'bias'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_11/attention/self/query/bias/adam_m ['generator', 'encoder', 'layer_11', 'attention', 'self', 'query', 'bias', 'adam_m'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_11/attention/self/query/bias/adam_v"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:transformers.modeling_electra:Skipping global_step\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      " ['generator', 'encoder', 'layer_11', 'attention', 'self', 'query', 'bias', 'adam_v'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_11/attention/self/query/kernel ['generator', 'encoder', 'layer_11', 'attention', 'self', 'query', 'kernel'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_11/attention/self/query/kernel/adam_m ['generator', 'encoder', 'layer_11', 'attention', 'self', 'query', 'kernel', 'adam_m'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_11/attention/self/query/kernel/adam_v ['generator', 'encoder', 'layer_11', 'attention', 'self', 'query', 'kernel', 'adam_v'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_11/attention/self/value/bias ['generator', 'encoder', 'layer_11', 'attention', 'self', 'value', 'bias'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_11/attention/self/value/bias/adam_m ['generator', 'encoder', 'layer_11', 'attention', 'self', 'value', 'bias', 'adam_m'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_11/attention/self/value/bias/adam_v ['generator', 'encoder', 'layer_11', 'attention', 'self', 'value', 'bias', 'adam_v'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_11/attention/self/value/kernel ['generator', 'encoder', 'layer_11', 'attention', 'self', 'value', 'kernel'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_11/attention/self/value/kernel/adam_m ['generator', 'encoder', 'layer_11', 'attention', 'self', 'value', 'kernel', 'adam_m'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_11/attention/self/value/kernel/adam_v ['generator', 'encoder', 'layer_11', 'attention', 'self', 'value', 'kernel', 'adam_v'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_11/intermediate/dense/bias ['generator', 'encoder', 'layer_11', 'intermediate', 'dense', 'bias'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_11/intermediate/dense/bias/adam_m ['generator', 'encoder', 'layer_11', 'intermediate', 'dense', 'bias', 'adam_m'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_11/intermediate/dense/bias/adam_v ['generator', 'encoder', 'layer_11', 'intermediate', 'dense', 'bias', 'adam_v'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_11/intermediate/dense/kernel ['generator', 'encoder', 'layer_11', 'intermediate', 'dense', 'kernel'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_11/intermediate/dense/kernel/adam_m ['generator', 'encoder', 'layer_11', 'intermediate', 'dense', 'kernel', 'adam_m'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_11/intermediate/dense/kernel/adam_v ['generator', 'encoder', 'layer_11', 'intermediate', 'dense', 'kernel', 'adam_v'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_11/output/LayerNorm/beta ['generator', 'encoder', 'layer_11', 'output', 'LayerNorm', 'beta'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_11/output/LayerNorm/beta/adam_m ['generator', 'encoder', 'layer_11', 'output', 'LayerNorm', 'beta', 'adam_m'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_11/output/LayerNorm/beta/adam_v ['generator', 'encoder', 'layer_11', 'output', 'LayerNorm', 'beta', 'adam_v'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_11/output/LayerNorm/gamma ['generator', 'encoder', 'layer_11', 'output', 'LayerNorm', 'gamma'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_11/output/LayerNorm/gamma/adam_m ['generator', 'encoder', 'layer_11', 'output', 'LayerNorm', 'gamma', 'adam_m'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_11/output/LayerNorm/gamma/adam_v ['generator', 'encoder', 'layer_11', 'output', 'LayerNorm', 'gamma', 'adam_v'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_11/output/dense/bias ['generator', 'encoder', 'layer_11', 'output', 'dense', 'bias'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_11/output/dense/bias/adam_m ['generator', 'encoder', 'layer_11', 'output', 'dense', 'bias', 'adam_m'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_11/output/dense/bias/adam_v ['generator', 'encoder', 'layer_11', 'output', 'dense', 'bias', 'adam_v'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_11/output/dense/kernel ['generator', 'encoder', 'layer_11', 'output', 'dense', 'kernel'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_11/output/dense/kernel/adam_m ['generator', 'encoder', 'layer_11', 'output', 'dense', 'kernel', 'adam_m'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_11/output/dense/kernel/adam_v ['generator', 'encoder', 'layer_11', 'output', 'dense', 'kernel', 'adam_v'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_2/attention/output/LayerNorm/beta ['generator', 'encoder', 'layer_2', 'attention', 'output', 'LayerNorm', 'beta'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_2/attention/output/LayerNorm/beta/adam_m ['generator', 'encoder', 'layer_2', 'attention', 'output', 'LayerNorm', 'beta', 'adam_m'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_2/attention/output/LayerNorm/beta/adam_v ['generator', 'encoder', 'layer_2', 'attention', 'output', 'LayerNorm', 'beta', 'adam_v'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_2/attention/output/LayerNorm/gamma ['generator', 'encoder', 'layer_2', 'attention', 'output', 'LayerNorm', 'gamma'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_2/attention/output/LayerNorm/gamma/adam_m ['generator', 'encoder', 'layer_2', 'attention', 'output', 'LayerNorm', 'gamma', 'adam_m'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_2/attention/output/LayerNorm/gamma/adam_v ['generator', 'encoder', 'layer_2', 'attention', 'output', 'LayerNorm', 'gamma', 'adam_v'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_2/attention/output/dense/bias ['generator', 'encoder', 'layer_2', 'attention', 'output', 'dense', 'bias'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_2/attention/output/dense/bias/adam_m ['generator', 'encoder', 'layer_2', 'attention', 'output', 'dense', 'bias', 'adam_m'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_2/attention/output/dense/bias/adam_v ['generator', 'encoder', 'layer_2', 'attention', 'output', 'dense', 'bias', 'adam_v'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_2/attention/output/dense/kernel ['generator', 'encoder', 'layer_2', 'attention', 'output', 'dense', 'kernel'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_2/attention/output/dense/kernel/adam_m ['generator', 'encoder', 'layer_2', 'attention', 'output', 'dense', 'kernel', 'adam_m'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_2/attention/output/dense/kernel/adam_v ['generator', 'encoder', 'layer_2', 'attention', 'output', 'dense', 'kernel', 'adam_v'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_2/attention/self/key/bias ['generator', 'encoder', 'layer_2', 'attention', 'self', 'key', 'bias'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_2/attention/self/key/bias/adam_m ['generator', 'encoder', 'layer_2', 'attention', 'self', 'key', 'bias', 'adam_m'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_2/attention/self/key/bias/adam_v ['generator', 'encoder', 'layer_2', 'attention', 'self', 'key', 'bias', 'adam_v'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_2/attention/self/key/kernel ['generator', 'encoder', 'layer_2', 'attention', 'self', 'key', 'kernel'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_2/attention/self/key/kernel/adam_m ['generator', 'encoder', 'layer_2', 'attention', 'self', 'key', 'kernel', 'adam_m'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_2/attention/self/key/kernel/adam_v ['generator', 'encoder', 'layer_2', 'attention', 'self', 'key', 'kernel', 'adam_v'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_2/attention/self/query/bias ['generator', 'encoder', 'layer_2', 'attention', 'self', 'query', 'bias'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_2/attention/self/query/bias/adam_m ['generator', 'encoder', 'layer_2', 'attention', 'self', 'query', 'bias', 'adam_m'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_2/attention/self/query/bias/adam_v ['generator', 'encoder', 'layer_2', 'attention', 'self', 'query', 'bias', 'adam_v'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_2/attention/self/query/kernel ['generator', 'encoder', 'layer_2', 'attention', 'self', 'query', 'kernel'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_2/attention/self/query/kernel/adam_m ['generator', 'encoder', 'layer_2', 'attention', 'self', 'query', 'kernel', 'adam_m'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_2/attention/self/query/kernel/adam_v ['generator', 'encoder', 'layer_2', 'attention', 'self', 'query', 'kernel', 'adam_v'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_2/attention/self/value/bias ['generator', 'encoder', 'layer_2', 'attention', 'self', 'value', 'bias'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_2/attention/self/value/bias/adam_m ['generator', 'encoder', 'layer_2', 'attention', 'self', 'value', 'bias', 'adam_m'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_2/attention/self/value/bias/adam_v ['generator', 'encoder', 'layer_2', 'attention', 'self', 'value', 'bias', 'adam_v'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_2/attention/self/value/kernel ['generator', 'encoder', 'layer_2', 'attention', 'self', 'value', 'kernel'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_2/attention/self/value/kernel/adam_m ['generator', 'encoder', 'layer_2', 'attention', 'self', 'value', 'kernel', 'adam_m'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_2/attention/self/value/kernel/adam_v ['generator', 'encoder', 'layer_2', 'attention', 'self', 'value', 'kernel', 'adam_v'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_2/intermediate/dense/bias ['generator', 'encoder', 'layer_2', 'intermediate', 'dense', 'bias'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_2/intermediate/dense/bias/adam_m ['generator', 'encoder', 'layer_2', 'intermediate', 'dense', 'bias', 'adam_m'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_2/intermediate/dense/bias/adam_v ['generator', 'encoder', 'layer_2', 'intermediate', 'dense', 'bias', 'adam_v'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_2/intermediate/dense/kernel ['generator', 'encoder', 'layer_2', 'intermediate', 'dense', 'kernel'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_2/intermediate/dense/kernel/adam_m ['generator', 'encoder', 'layer_2', 'intermediate', 'dense', 'kernel', 'adam_m'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_2/intermediate/dense/kernel/adam_v ['generator', 'encoder', 'layer_2', 'intermediate', 'dense', 'kernel', 'adam_v'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_2/output/LayerNorm/beta ['generator', 'encoder', 'layer_2', 'output', 'LayerNorm', 'beta'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_2/output/LayerNorm/beta/adam_m ['generator', 'encoder', 'layer_2', 'output', 'LayerNorm', 'beta', 'adam_m'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_2/output/LayerNorm/beta/adam_v ['generator', 'encoder', 'layer_2', 'output', 'LayerNorm', 'beta', 'adam_v'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_2/output/LayerNorm/gamma ['generator', 'encoder', 'layer_2', 'output', 'LayerNorm', 'gamma'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_2/output/LayerNorm/gamma/adam_m ['generator', 'encoder', 'layer_2', 'output', 'LayerNorm', 'gamma', 'adam_m'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_2/output/LayerNorm/gamma/adam_v ['generator', 'encoder', 'layer_2', 'output', 'LayerNorm', 'gamma', 'adam_v'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_2/output/dense/bias ['generator', 'encoder', 'layer_2', 'output', 'dense', 'bias'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_2/output/dense/bias/adam_m ['generator', 'encoder', 'layer_2', 'output', 'dense', 'bias', 'adam_m'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_2/output/dense/bias/adam_v ['generator', 'encoder', 'layer_2', 'output', 'dense', 'bias', 'adam_v'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_2/output/dense/kernel ['generator', 'encoder', 'layer_2', 'output', 'dense', 'kernel'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_2/output/dense/kernel/adam_m ['generator', 'encoder', 'layer_2', 'output', 'dense', 'kernel', 'adam_m'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_2/output/dense/kernel/adam_v ['generator', 'encoder', 'layer_2', 'output', 'dense', 'kernel', 'adam_v'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_3/attention/output/LayerNorm/beta ['generator', 'encoder', 'layer_3', 'attention', 'output', 'LayerNorm', 'beta'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_3/attention/output/LayerNorm/beta/adam_m ['generator', 'encoder', 'layer_3', 'attention', 'output', 'LayerNorm', 'beta', 'adam_m'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_3/attention/output/LayerNorm/beta/adam_v ['generator', 'encoder', 'layer_3', 'attention', 'output', 'LayerNorm', 'beta', 'adam_v'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_3/attention/output/LayerNorm/gamma ['generator', 'encoder', 'layer_3', 'attention', 'output', 'LayerNorm', 'gamma'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_3/attention/output/LayerNorm/gamma/adam_m ['generator', 'encoder', 'layer_3', 'attention', 'output', 'LayerNorm', 'gamma', 'adam_m'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_3/attention/output/LayerNorm/gamma/adam_v ['generator', 'encoder', 'layer_3', 'attention', 'output', 'LayerNorm', 'gamma', 'adam_v'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_3/attention/output/dense/bias ['generator', 'encoder', 'layer_3', 'attention', 'output', 'dense', 'bias'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_3/attention/output/dense/bias/adam_m ['generator', 'encoder', 'layer_3', 'attention', 'output', 'dense', 'bias', 'adam_m'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_3/attention/output/dense/bias/adam_v ['generator', 'encoder', 'layer_3', 'attention', 'output', 'dense', 'bias', 'adam_v'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_3/attention/output/dense/kernel ['generator', 'encoder', 'layer_3', 'attention', 'output', 'dense', 'kernel'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_3/attention/output/dense/kernel/adam_m ['generator', 'encoder', 'layer_3', 'attention', 'output', 'dense', 'kernel', 'adam_m'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_3/attention/output/dense/kernel/adam_v ['generator', 'encoder', 'layer_3', 'attention', 'output', 'dense', 'kernel', 'adam_v'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_3/attention/self/key/bias ['generator', 'encoder', 'layer_3', 'attention', 'self', 'key', 'bias'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_3/attention/self/key/bias/adam_m ['generator', 'encoder', 'layer_3', 'attention', 'self', 'key', 'bias', 'adam_m'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_3/attention/self/key/bias/adam_v ['generator', 'encoder', 'layer_3', 'attention', 'self', 'key', 'bias', 'adam_v'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_3/attention/self/key/kernel ['generator', 'encoder', 'layer_3', 'attention', 'self', 'key', 'kernel'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_3/attention/self/key/kernel/adam_m ['generator', 'encoder', 'layer_3', 'attention', 'self', 'key', 'kernel', 'adam_m'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_3/attention/self/key/kernel/adam_v ['generator', 'encoder', 'layer_3', 'attention', 'self', 'key', 'kernel', 'adam_v'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_3/attention/self/query/bias ['generator', 'encoder', 'layer_3', 'attention', 'self', 'query', 'bias'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_3/attention/self/query/bias/adam_m ['generator', 'encoder', 'layer_3', 'attention', 'self', 'query', 'bias', 'adam_m'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_3/attention/self/query/bias/adam_v ['generator', 'encoder', 'layer_3', 'attention', 'self', 'query', 'bias', 'adam_v'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_3/attention/self/query/kernel ['generator', 'encoder', 'layer_3', 'attention', 'self', 'query', 'kernel'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_3/attention/self/query/kernel/adam_m ['generator', 'encoder', 'layer_3', 'attention', 'self', 'query', 'kernel', 'adam_m'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_3/attention/self/query/kernel/adam_v ['generator', 'encoder', 'layer_3', 'attention', 'self', 'query', 'kernel', 'adam_v'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_3/attention/self/value/bias ['generator', 'encoder', 'layer_3', 'attention', 'self', 'value', 'bias'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_3/attention/self/value/bias/adam_m ['generator', 'encoder', 'layer_3', 'attention', 'self', 'value', 'bias', 'adam_m'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_3/attention/self/value/bias/adam_v ['generator', 'encoder', 'layer_3', 'attention', 'self', 'value', 'bias', 'adam_v'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_3/attention/self/value/kernel ['generator', 'encoder', 'layer_3', 'attention', 'self', 'value', 'kernel'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_3/attention/self/value/kernel/adam_m ['generator', 'encoder', 'layer_3', 'attention', 'self', 'value', 'kernel', 'adam_m'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_3/attention/self/value/kernel/adam_v ['generator', 'encoder', 'layer_3', 'attention', 'self', 'value', 'kernel', 'adam_v'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_3/intermediate/dense/bias ['generator', 'encoder', 'layer_3', 'intermediate', 'dense', 'bias'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_3/intermediate/dense/bias/adam_m ['generator', 'encoder', 'layer_3', 'intermediate', 'dense', 'bias', 'adam_m'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_3/intermediate/dense/bias/adam_v ['generator', 'encoder', 'layer_3', 'intermediate', 'dense', 'bias', 'adam_v'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_3/intermediate/dense/kernel ['generator', 'encoder', 'layer_3', 'intermediate', 'dense', 'kernel'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_3/intermediate/dense/kernel/adam_m ['generator', 'encoder', 'layer_3', 'intermediate', 'dense', 'kernel', 'adam_m'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_3/intermediate/dense/kernel/adam_v ['generator', 'encoder', 'layer_3', 'intermediate', 'dense', 'kernel', 'adam_v'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_3/output/LayerNorm/beta ['generator', 'encoder', 'layer_3', 'output', 'LayerNorm', 'beta'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_3/output/LayerNorm/beta/adam_m ['generator', 'encoder', 'layer_3', 'output', 'LayerNorm', 'beta', 'adam_m'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_3/output/LayerNorm/beta/adam_v ['generator', 'encoder', 'layer_3', 'output', 'LayerNorm', 'beta', 'adam_v'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_3/output/LayerNorm/gamma ['generator', 'encoder', 'layer_3', 'output', 'LayerNorm', 'gamma'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_3/output/LayerNorm/gamma/adam_m ['generator', 'encoder', 'layer_3', 'output', 'LayerNorm', 'gamma', 'adam_m'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_3/output/LayerNorm/gamma/adam_v ['generator', 'encoder', 'layer_3', 'output', 'LayerNorm', 'gamma', 'adam_v'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_3/output/dense/bias ['generator', 'encoder', 'layer_3', 'output', 'dense', 'bias'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_3/output/dense/bias/adam_m ['generator', 'encoder', 'layer_3', 'output', 'dense', 'bias', 'adam_m'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_3/output/dense/bias/adam_v ['generator', 'encoder', 'layer_3', 'output', 'dense', 'bias', 'adam_v'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_3/output/dense/kernel ['generator', 'encoder', 'layer_3', 'output', 'dense', 'kernel'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_3/output/dense/kernel/adam_m ['generator', 'encoder', 'layer_3', 'output', 'dense', 'kernel', 'adam_m'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_3/output/dense/kernel/adam_v ['generator', 'encoder', 'layer_3', 'output', 'dense', 'kernel', 'adam_v'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_4/attention/output/LayerNorm/beta ['generator', 'encoder', 'layer_4', 'attention', 'output', 'LayerNorm', 'beta'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_4/attention/output/LayerNorm/beta/adam_m ['generator', 'encoder', 'layer_4', 'attention', 'output', 'LayerNorm', 'beta', 'adam_m'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_4/attention/output/LayerNorm/beta/adam_v ['generator', 'encoder', 'layer_4', 'attention', 'output', 'LayerNorm', 'beta', 'adam_v'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_4/attention/output/LayerNorm/gamma ['generator', 'encoder', 'layer_4', 'attention', 'output', 'LayerNorm', 'gamma'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_4/attention/output/LayerNorm/gamma/adam_m ['generator', 'encoder', 'layer_4', 'attention', 'output', 'LayerNorm', 'gamma', 'adam_m'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_4/attention/output/LayerNorm/gamma/adam_v ['generator', 'encoder', 'layer_4', 'attention', 'output', 'LayerNorm', 'gamma', 'adam_v'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_4/attention/output/dense/bias ['generator', 'encoder', 'layer_4', 'attention', 'output', 'dense', 'bias'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_4/attention/output/dense/bias/adam_m ['generator', 'encoder', 'layer_4', 'attention', 'output', 'dense', 'bias', 'adam_m'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_4/attention/output/dense/bias/adam_v ['generator', 'encoder', 'layer_4', 'attention', 'output', 'dense', 'bias', 'adam_v'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_4/attention/output/dense/kernel ['generator', 'encoder', 'layer_4', 'attention', 'output', 'dense', 'kernel'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_4/attention/output/dense/kernel/adam_m ['generator', 'encoder', 'layer_4', 'attention', 'output', 'dense', 'kernel', 'adam_m'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_4/attention/output/dense/kernel/adam_v ['generator', 'encoder', 'layer_4', 'attention', 'output', 'dense', 'kernel', 'adam_v'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_4/attention/self/key/bias ['generator', 'encoder', 'layer_4', 'attention', 'self', 'key', 'bias'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_4/attention/self/key/bias/adam_m ['generator', 'encoder', 'layer_4', 'attention', 'self', 'key', 'bias', 'adam_m'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_4/attention/self/key/bias/adam_v ['generator', 'encoder', 'layer_4', 'attention', 'self', 'key', 'bias', 'adam_v'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_4/attention/self/key/kernel ['generator', 'encoder', 'layer_4', 'attention', 'self', 'key', 'kernel'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_4/attention/self/key/kernel/adam_m ['generator', 'encoder', 'layer_4', 'attention', 'self', 'key', 'kernel', 'adam_m'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_4/attention/self/key/kernel/adam_v ['generator', 'encoder', 'layer_4', 'attention', 'self', 'key', 'kernel', 'adam_v'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_4/attention/self/query/bias ['generator', 'encoder', 'layer_4', 'attention', 'self', 'query', 'bias'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_4/attention/self/query/bias/adam_m ['generator', 'encoder', 'layer_4', 'attention', 'self', 'query', 'bias', 'adam_m'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_4/attention/self/query/bias/adam_v ['generator', 'encoder', 'layer_4', 'attention', 'self', 'query', 'bias', 'adam_v'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_4/attention/self/query/kernel ['generator', 'encoder', 'layer_4', 'attention', 'self', 'query', 'kernel'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_4/attention/self/query/kernel/adam_m ['generator', 'encoder', 'layer_4', 'attention', 'self', 'query', 'kernel', 'adam_m'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_4/attention/self/query/kernel/adam_v ['generator', 'encoder', 'layer_4', 'attention', 'self', 'query', 'kernel', 'adam_v'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_4/attention/self/value/bias ['generator', 'encoder', 'layer_4', 'attention', 'self', 'value', 'bias'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_4/attention/self/value/bias/adam_m ['generator', 'encoder', 'layer_4', 'attention', 'self', 'value', 'bias', 'adam_m'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_4/attention/self/value/bias/adam_v ['generator', 'encoder', 'layer_4', 'attention', 'self', 'value', 'bias', 'adam_v'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_4/attention/self/value/kernel ['generator', 'encoder', 'layer_4', 'attention', 'self', 'value', 'kernel'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_4/attention/self/value/kernel/adam_m ['generator', 'encoder', 'layer_4', 'attention', 'self', 'value', 'kernel', 'adam_m'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_4/attention/self/value/kernel/adam_v ['generator', 'encoder', 'layer_4', 'attention', 'self', 'value', 'kernel', 'adam_v'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_4/intermediate/dense/bias ['generator', 'encoder', 'layer_4', 'intermediate', 'dense', 'bias'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_4/intermediate/dense/bias/adam_m ['generator', 'encoder', 'layer_4', 'intermediate', 'dense', 'bias', 'adam_m'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_4/intermediate/dense/bias/adam_v ['generator', 'encoder', 'layer_4', 'intermediate', 'dense', 'bias', 'adam_v'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_4/intermediate/dense/kernel ['generator', 'encoder', 'layer_4', 'intermediate', 'dense', 'kernel'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_4/intermediate/dense/kernel/adam_m ['generator', 'encoder', 'layer_4', 'intermediate', 'dense', 'kernel', 'adam_m'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_4/intermediate/dense/kernel/adam_v ['generator', 'encoder', 'layer_4', 'intermediate', 'dense', 'kernel', 'adam_v'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_4/output/LayerNorm/beta ['generator', 'encoder', 'layer_4', 'output', 'LayerNorm', 'beta'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_4/output/LayerNorm/beta/adam_m ['generator', 'encoder', 'layer_4', 'output', 'LayerNorm', 'beta', 'adam_m'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_4/output/LayerNorm/beta/adam_v ['generator', 'encoder', 'layer_4', 'output', 'LayerNorm', 'beta', 'adam_v'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_4/output/LayerNorm/gamma ['generator', 'encoder', 'layer_4', 'output', 'LayerNorm', 'gamma'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_4/output/LayerNorm/gamma/adam_m ['generator', 'encoder', 'layer_4', 'output', 'LayerNorm', 'gamma', 'adam_m'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_4/output/LayerNorm/gamma/adam_v ['generator', 'encoder', 'layer_4', 'output', 'LayerNorm', 'gamma', 'adam_v'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_4/output/dense/bias ['generator', 'encoder', 'layer_4', 'output', 'dense', 'bias'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_4/output/dense/bias/adam_m ['generator', 'encoder', 'layer_4', 'output', 'dense', 'bias', 'adam_m'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_4/output/dense/bias/adam_v ['generator', 'encoder', 'layer_4', 'output', 'dense', 'bias', 'adam_v'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_4/output/dense/kernel ['generator', 'encoder', 'layer_4', 'output', 'dense', 'kernel'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_4/output/dense/kernel/adam_m ['generator', 'encoder', 'layer_4', 'output', 'dense', 'kernel', 'adam_m'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_4/output/dense/kernel/adam_v ['generator', 'encoder', 'layer_4', 'output', 'dense', 'kernel', 'adam_v'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_5/attention/output/LayerNorm/beta ['generator', 'encoder', 'layer_5', 'attention', 'output', 'LayerNorm', 'beta'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_5/attention/output/LayerNorm/beta/adam_m ['generator', 'encoder', 'layer_5', 'attention', 'output', 'LayerNorm', 'beta', 'adam_m'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_5/attention/output/LayerNorm/beta/adam_v ['generator', 'encoder', 'layer_5', 'attention', 'output', 'LayerNorm', 'beta', 'adam_v'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_5/attention/output/LayerNorm/gamma ['generator', 'encoder', 'layer_5', 'attention', 'output', 'LayerNorm', 'gamma'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_5/attention/output/LayerNorm/gamma/adam_m ['generator', 'encoder', 'layer_5', 'attention', 'output', 'LayerNorm', 'gamma', 'adam_m'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_5/attention/output/LayerNorm/gamma/adam_v ['generator', 'encoder', 'layer_5', 'attention', 'output', 'LayerNorm', 'gamma', 'adam_v'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_5/attention/output/dense/bias ['generator', 'encoder', 'layer_5', 'attention', 'output', 'dense', 'bias'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_5/attention/output/dense/bias/adam_m ['generator', 'encoder', 'layer_5', 'attention', 'output', 'dense', 'bias', 'adam_m'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_5/attention/output/dense/bias/adam_v ['generator', 'encoder', 'layer_5', 'attention', 'output', 'dense', 'bias', 'adam_v'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_5/attention/output/dense/kernel ['generator', 'encoder', 'layer_5', 'attention', 'output', 'dense', 'kernel'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_5/attention/output/dense/kernel/adam_m ['generator', 'encoder', 'layer_5', 'attention', 'output', 'dense', 'kernel', 'adam_m'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_5/attention/output/dense/kernel/adam_v ['generator', 'encoder', 'layer_5', 'attention', 'output', 'dense', 'kernel', 'adam_v'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_5/attention/self/key/bias ['generator', 'encoder', 'layer_5', 'attention', 'self', 'key', 'bias'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_5/attention/self/key/bias/adam_m ['generator', 'encoder', 'layer_5', 'attention', 'self', 'key', 'bias', 'adam_m'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_5/attention/self/key/bias/adam_v ['generator', 'encoder', 'layer_5', 'attention', 'self', 'key', 'bias', 'adam_v'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_5/attention/self/key/kernel ['generator', 'encoder', 'layer_5', 'attention', 'self', 'key', 'kernel'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_5/attention/self/key/kernel/adam_m ['generator', 'encoder', 'layer_5', 'attention', 'self', 'key', 'kernel', 'adam_m'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_5/attention/self/key/kernel/adam_v ['generator', 'encoder', 'layer_5', 'attention', 'self', 'key', 'kernel', 'adam_v'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_5/attention/self/query/bias ['generator', 'encoder', 'layer_5', 'attention', 'self', 'query', 'bias'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_5/attention/self/query/bias/adam_m ['generator', 'encoder', 'layer_5', 'attention', 'self', 'query', 'bias', 'adam_m'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_5/attention/self/query/bias/adam_v ['generator', 'encoder', 'layer_5', 'attention', 'self', 'query', 'bias', 'adam_v'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_5/attention/self/query/kernel ['generator', 'encoder', 'layer_5', 'attention', 'self', 'query', 'kernel'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_5/attention/self/query/kernel/adam_m ['generator', 'encoder', 'layer_5', 'attention', 'self', 'query', 'kernel', 'adam_m'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_5/attention/self/query/kernel/adam_v ['generator', 'encoder', 'layer_5', 'attention', 'self', 'query', 'kernel', 'adam_v'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_5/attention/self/value/bias ['generator', 'encoder', 'layer_5', 'attention', 'self', 'value', 'bias'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_5/attention/self/value/bias/adam_m ['generator', 'encoder', 'layer_5', 'attention', 'self', 'value', 'bias', 'adam_m'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_5/attention/self/value/bias/adam_v ['generator', 'encoder', 'layer_5', 'attention', 'self', 'value', 'bias', 'adam_v'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_5/attention/self/value/kernel ['generator', 'encoder', 'layer_5', 'attention', 'self', 'value', 'kernel'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_5/attention/self/value/kernel/adam_m ['generator', 'encoder', 'layer_5', 'attention', 'self', 'value', 'kernel', 'adam_m'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_5/attention/self/value/kernel/adam_v ['generator', 'encoder', 'layer_5', 'attention', 'self', 'value', 'kernel', 'adam_v'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_5/intermediate/dense/bias ['generator', 'encoder', 'layer_5', 'intermediate', 'dense', 'bias'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_5/intermediate/dense/bias/adam_m ['generator', 'encoder', 'layer_5', 'intermediate', 'dense', 'bias', 'adam_m'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_5/intermediate/dense/bias/adam_v ['generator', 'encoder', 'layer_5', 'intermediate', 'dense', 'bias', 'adam_v'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_5/intermediate/dense/kernel ['generator', 'encoder', 'layer_5', 'intermediate', 'dense', 'kernel'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_5/intermediate/dense/kernel/adam_m ['generator', 'encoder', 'layer_5', 'intermediate', 'dense', 'kernel', 'adam_m'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_5/intermediate/dense/kernel/adam_v ['generator', 'encoder', 'layer_5', 'intermediate', 'dense', 'kernel', 'adam_v'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_5/output/LayerNorm/beta ['generator', 'encoder', 'layer_5', 'output', 'LayerNorm', 'beta'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_5/output/LayerNorm/beta/adam_m ['generator', 'encoder', 'layer_5', 'output', 'LayerNorm', 'beta', 'adam_m'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_5/output/LayerNorm/beta/adam_v ['generator', 'encoder', 'layer_5', 'output', 'LayerNorm', 'beta', 'adam_v'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_5/output/LayerNorm/gamma ['generator', 'encoder', 'layer_5', 'output', 'LayerNorm', 'gamma'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_5/output/LayerNorm/gamma/adam_m ['generator', 'encoder', 'layer_5', 'output', 'LayerNorm', 'gamma', 'adam_m'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_5/output/LayerNorm/gamma/adam_v ['generator', 'encoder', 'layer_5', 'output', 'LayerNorm', 'gamma', 'adam_v'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_5/output/dense/bias ['generator', 'encoder', 'layer_5', 'output', 'dense', 'bias'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_5/output/dense/bias/adam_m ['generator', 'encoder', 'layer_5', 'output', 'dense', 'bias', 'adam_m'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_5/output/dense/bias/adam_v ['generator', 'encoder', 'layer_5', 'output', 'dense', 'bias', 'adam_v'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_5/output/dense/kernel ['generator', 'encoder', 'layer_5', 'output', 'dense', 'kernel'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_5/output/dense/kernel/adam_m ['generator', 'encoder', 'layer_5', 'output', 'dense', 'kernel', 'adam_m'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_5/output/dense/kernel/adam_v ['generator', 'encoder', 'layer_5', 'output', 'dense', 'kernel', 'adam_v'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_6/attention/output/LayerNorm/beta ['generator', 'encoder', 'layer_6', 'attention', 'output', 'LayerNorm', 'beta'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_6/attention/output/LayerNorm/beta/adam_m ['generator', 'encoder', 'layer_6', 'attention', 'output', 'LayerNorm', 'beta', 'adam_m'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_6/attention/output/LayerNorm/beta/adam_v ['generator', 'encoder', 'layer_6', 'attention', 'output', 'LayerNorm', 'beta', 'adam_v'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_6/attention/output/LayerNorm/gamma ['generator', 'encoder', 'layer_6', 'attention', 'output', 'LayerNorm', 'gamma'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_6/attention/output/LayerNorm/gamma/adam_m ['generator', 'encoder', 'layer_6', 'attention', 'output', 'LayerNorm', 'gamma', 'adam_m'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_6/attention/output/LayerNorm/gamma/adam_v ['generator', 'encoder', 'layer_6', 'attention', 'output', 'LayerNorm', 'gamma', 'adam_v'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_6/attention/output/dense/bias ['generator', 'encoder', 'layer_6', 'attention', 'output', 'dense', 'bias'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_6/attention/output/dense/bias/adam_m ['generator', 'encoder', 'layer_6', 'attention', 'output', 'dense', 'bias', 'adam_m'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_6/attention/output/dense/bias/adam_v ['generator', 'encoder', 'layer_6', 'attention', 'output', 'dense', 'bias', 'adam_v'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_6/attention/output/dense/kernel ['generator', 'encoder', 'layer_6', 'attention', 'output', 'dense', 'kernel'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_6/attention/output/dense/kernel/adam_m ['generator', 'encoder', 'layer_6', 'attention', 'output', 'dense', 'kernel', 'adam_m'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_6/attention/output/dense/kernel/adam_v ['generator', 'encoder', 'layer_6', 'attention', 'output', 'dense', 'kernel', 'adam_v'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_6/attention/self/key/bias ['generator', 'encoder', 'layer_6', 'attention', 'self', 'key', 'bias'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_6/attention/self/key/bias/adam_m ['generator', 'encoder', 'layer_6', 'attention', 'self', 'key', 'bias', 'adam_m'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_6/attention/self/key/bias/adam_v ['generator', 'encoder', 'layer_6', 'attention', 'self', 'key', 'bias', 'adam_v'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_6/attention/self/key/kernel ['generator', 'encoder', 'layer_6', 'attention', 'self', 'key', 'kernel'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_6/attention/self/key/kernel/adam_m ['generator', 'encoder', 'layer_6', 'attention', 'self', 'key', 'kernel', 'adam_m'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_6/attention/self/key/kernel/adam_v ['generator', 'encoder', 'layer_6', 'attention', 'self', 'key', 'kernel', 'adam_v'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_6/attention/self/query/bias ['generator', 'encoder', 'layer_6', 'attention', 'self', 'query', 'bias'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_6/attention/self/query/bias/adam_m ['generator', 'encoder', 'layer_6', 'attention', 'self', 'query', 'bias', 'adam_m'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_6/attention/self/query/bias/adam_v ['generator', 'encoder', 'layer_6', 'attention', 'self', 'query', 'bias', 'adam_v'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_6/attention/self/query/kernel ['generator', 'encoder', 'layer_6', 'attention', 'self', 'query', 'kernel'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_6/attention/self/query/kernel/adam_m ['generator', 'encoder', 'layer_6', 'attention', 'self', 'query', 'kernel', 'adam_m'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_6/attention/self/query/kernel/adam_v ['generator', 'encoder', 'layer_6', 'attention', 'self', 'query', 'kernel', 'adam_v'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_6/attention/self/value/bias ['generator', 'encoder', 'layer_6', 'attention', 'self', 'value', 'bias'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_6/attention/self/value/bias/adam_m ['generator', 'encoder', 'layer_6', 'attention', 'self', 'value', 'bias', 'adam_m'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_6/attention/self/value/bias/adam_v ['generator', 'encoder', 'layer_6', 'attention', 'self', 'value', 'bias', 'adam_v'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_6/attention/self/value/kernel ['generator', 'encoder', 'layer_6', 'attention', 'self', 'value', 'kernel'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_6/attention/self/value/kernel/adam_m ['generator', 'encoder', 'layer_6', 'attention', 'self', 'value', 'kernel', 'adam_m'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_6/attention/self/value/kernel/adam_v ['generator', 'encoder', 'layer_6', 'attention', 'self', 'value', 'kernel', 'adam_v'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_6/intermediate/dense/bias ['generator', 'encoder', 'layer_6', 'intermediate', 'dense', 'bias'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_6/intermediate/dense/bias/adam_m ['generator', 'encoder', 'layer_6', 'intermediate', 'dense', 'bias', 'adam_m'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_6/intermediate/dense/bias/adam_v ['generator', 'encoder', 'layer_6', 'intermediate', 'dense', 'bias', 'adam_v'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_6/intermediate/dense/kernel ['generator', 'encoder', 'layer_6', 'intermediate', 'dense', 'kernel'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_6/intermediate/dense/kernel/adam_m ['generator', 'encoder', 'layer_6', 'intermediate', 'dense', 'kernel', 'adam_m'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_6/intermediate/dense/kernel/adam_v ['generator', 'encoder', 'layer_6', 'intermediate', 'dense', 'kernel', 'adam_v'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_6/output/LayerNorm/beta ['generator', 'encoder', 'layer_6', 'output', 'LayerNorm', 'beta'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_6/output/LayerNorm/beta/adam_m ['generator', 'encoder', 'layer_6', 'output', 'LayerNorm', 'beta', 'adam_m'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_6/output/LayerNorm/beta/adam_v ['generator', 'encoder', 'layer_6', 'output', 'LayerNorm', 'beta', 'adam_v'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_6/output/LayerNorm/gamma ['generator', 'encoder', 'layer_6', 'output', 'LayerNorm', 'gamma'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_6/output/LayerNorm/gamma/adam_m ['generator', 'encoder', 'layer_6', 'output', 'LayerNorm', 'gamma', 'adam_m'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_6/output/LayerNorm/gamma/adam_v ['generator', 'encoder', 'layer_6', 'output', 'LayerNorm', 'gamma', 'adam_v'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_6/output/dense/bias ['generator', 'encoder', 'layer_6', 'output', 'dense', 'bias'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_6/output/dense/bias/adam_m ['generator', 'encoder', 'layer_6', 'output', 'dense', 'bias', 'adam_m'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_6/output/dense/bias/adam_v ['generator', 'encoder', 'layer_6', 'output', 'dense', 'bias', 'adam_v'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_6/output/dense/kernel ['generator', 'encoder', 'layer_6', 'output', 'dense', 'kernel'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_6/output/dense/kernel/adam_m ['generator', 'encoder', 'layer_6', 'output', 'dense', 'kernel', 'adam_m'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_6/output/dense/kernel/adam_v ['generator', 'encoder', 'layer_6', 'output', 'dense', 'kernel', 'adam_v'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_7/attention/output/LayerNorm/beta ['generator', 'encoder', 'layer_7', 'attention', 'output', 'LayerNorm', 'beta'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_7/attention/output/LayerNorm/beta/adam_m ['generator', 'encoder', 'layer_7', 'attention', 'output', 'LayerNorm', 'beta', 'adam_m'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_7/attention/output/LayerNorm/beta/adam_v ['generator', 'encoder', 'layer_7', 'attention', 'output', 'LayerNorm', 'beta', 'adam_v'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_7/attention/output/LayerNorm/gamma ['generator', 'encoder', 'layer_7', 'attention', 'output', 'LayerNorm', 'gamma'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_7/attention/output/LayerNorm/gamma/adam_m ['generator', 'encoder', 'layer_7', 'attention', 'output', 'LayerNorm', 'gamma', 'adam_m'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_7/attention/output/LayerNorm/gamma/adam_v ['generator', 'encoder', 'layer_7', 'attention', 'output', 'LayerNorm', 'gamma', 'adam_v'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_7/attention/output/dense/bias ['generator', 'encoder', 'layer_7', 'attention', 'output', 'dense', 'bias'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_7/attention/output/dense/bias/adam_m ['generator', 'encoder', 'layer_7', 'attention', 'output', 'dense', 'bias', 'adam_m'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_7/attention/output/dense/bias/adam_v ['generator', 'encoder', 'layer_7', 'attention', 'output', 'dense', 'bias', 'adam_v'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_7/attention/output/dense/kernel ['generator', 'encoder', 'layer_7', 'attention', 'output', 'dense', 'kernel'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_7/attention/output/dense/kernel/adam_m ['generator', 'encoder', 'layer_7', 'attention', 'output', 'dense', 'kernel', 'adam_m'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_7/attention/output/dense/kernel/adam_v ['generator', 'encoder', 'layer_7', 'attention', 'output', 'dense', 'kernel', 'adam_v'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_7/attention/self/key/bias ['generator', 'encoder', 'layer_7', 'attention', 'self', 'key', 'bias'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_7/attention/self/key/bias/adam_m ['generator', 'encoder', 'layer_7', 'attention', 'self', 'key', 'bias', 'adam_m'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_7/attention/self/key/bias/adam_v ['generator', 'encoder', 'layer_7', 'attention', 'self', 'key', 'bias', 'adam_v'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_7/attention/self/key/kernel ['generator', 'encoder', 'layer_7', 'attention', 'self', 'key', 'kernel'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_7/attention/self/key/kernel/adam_m ['generator', 'encoder', 'layer_7', 'attention', 'self', 'key', 'kernel', 'adam_m'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_7/attention/self/key/kernel/adam_v ['generator', 'encoder', 'layer_7', 'attention', 'self', 'key', 'kernel', 'adam_v'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_7/attention/self/query/bias ['generator', 'encoder', 'layer_7', 'attention', 'self', 'query', 'bias'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_7/attention/self/query/bias/adam_m ['generator', 'encoder', 'layer_7', 'attention', 'self', 'query', 'bias', 'adam_m'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_7/attention/self/query/bias/adam_v ['generator', 'encoder', 'layer_7', 'attention', 'self', 'query', 'bias', 'adam_v'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_7/attention/self/query/kernel ['generator', 'encoder', 'layer_7', 'attention', 'self', 'query', 'kernel'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_7/attention/self/query/kernel/adam_m ['generator', 'encoder', 'layer_7', 'attention', 'self', 'query', 'kernel', 'adam_m'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_7/attention/self/query/kernel/adam_v ['generator', 'encoder', 'layer_7', 'attention', 'self', 'query', 'kernel', 'adam_v'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_7/attention/self/value/bias ['generator', 'encoder', 'layer_7', 'attention', 'self', 'value', 'bias'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_7/attention/self/value/bias/adam_m ['generator', 'encoder', 'layer_7', 'attention', 'self', 'value', 'bias', 'adam_m'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_7/attention/self/value/bias/adam_v ['generator', 'encoder', 'layer_7', 'attention', 'self', 'value', 'bias', 'adam_v'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_7/attention/self/value/kernel ['generator', 'encoder', 'layer_7', 'attention', 'self', 'value', 'kernel'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_7/attention/self/value/kernel/adam_m ['generator', 'encoder', 'layer_7', 'attention', 'self', 'value', 'kernel', 'adam_m'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_7/attention/self/value/kernel/adam_v ['generator', 'encoder', 'layer_7', 'attention', 'self', 'value', 'kernel', 'adam_v'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_7/intermediate/dense/bias ['generator', 'encoder', 'layer_7', 'intermediate', 'dense', 'bias'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_7/intermediate/dense/bias/adam_m ['generator', 'encoder', 'layer_7', 'intermediate', 'dense', 'bias', 'adam_m'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_7/intermediate/dense/bias/adam_v ['generator', 'encoder', 'layer_7', 'intermediate', 'dense', 'bias', 'adam_v'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_7/intermediate/dense/kernel ['generator', 'encoder', 'layer_7', 'intermediate', 'dense', 'kernel'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_7/intermediate/dense/kernel/adam_m ['generator', 'encoder', 'layer_7', 'intermediate', 'dense', 'kernel', 'adam_m'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_7/intermediate/dense/kernel/adam_v ['generator', 'encoder', 'layer_7', 'intermediate', 'dense', 'kernel', 'adam_v'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_7/output/LayerNorm/beta ['generator', 'encoder', 'layer_7', 'output', 'LayerNorm', 'beta'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_7/output/LayerNorm/beta/adam_m ['generator', 'encoder', 'layer_7', 'output', 'LayerNorm', 'beta', 'adam_m'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_7/output/LayerNorm/beta/adam_v ['generator', 'encoder', 'layer_7', 'output', 'LayerNorm', 'beta', 'adam_v'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_7/output/LayerNorm/gamma ['generator', 'encoder', 'layer_7', 'output', 'LayerNorm', 'gamma'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_7/output/LayerNorm/gamma/adam_m ['generator', 'encoder', 'layer_7', 'output', 'LayerNorm', 'gamma', 'adam_m'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_7/output/LayerNorm/gamma/adam_v ['generator', 'encoder', 'layer_7', 'output', 'LayerNorm', 'gamma', 'adam_v'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_7/output/dense/bias ['generator', 'encoder', 'layer_7', 'output', 'dense', 'bias'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_7/output/dense/bias/adam_m ['generator', 'encoder', 'layer_7', 'output', 'dense', 'bias', 'adam_m'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_7/output/dense/bias/adam_v ['generator', 'encoder', 'layer_7', 'output', 'dense', 'bias', 'adam_v'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_7/output/dense/kernel ['generator', 'encoder', 'layer_7', 'output', 'dense', 'kernel'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_7/output/dense/kernel/adam_m ['generator', 'encoder', 'layer_7', 'output', 'dense', 'kernel', 'adam_m'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_7/output/dense/kernel/adam_v ['generator', 'encoder', 'layer_7', 'output', 'dense', 'kernel', 'adam_v'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_8/attention/output/LayerNorm/beta ['generator', 'encoder', 'layer_8', 'attention', 'output', 'LayerNorm', 'beta'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_8/attention/output/LayerNorm/beta/adam_m ['generator', 'encoder', 'layer_8', 'attention', 'output', 'LayerNorm', 'beta', 'adam_m'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_8/attention/output/LayerNorm/beta/adam_v ['generator', 'encoder', 'layer_8', 'attention', 'output', 'LayerNorm', 'beta', 'adam_v'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_8/attention/output/LayerNorm/gamma ['generator', 'encoder', 'layer_8', 'attention', 'output', 'LayerNorm', 'gamma'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_8/attention/output/LayerNorm/gamma/adam_m ['generator', 'encoder', 'layer_8', 'attention', 'output', 'LayerNorm', 'gamma', 'adam_m'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_8/attention/output/LayerNorm/gamma/adam_v ['generator', 'encoder', 'layer_8', 'attention', 'output', 'LayerNorm', 'gamma', 'adam_v'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_8/attention/output/dense/bias ['generator', 'encoder', 'layer_8', 'attention', 'output', 'dense', 'bias'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_8/attention/output/dense/bias/adam_m ['generator', 'encoder', 'layer_8', 'attention', 'output', 'dense', 'bias', 'adam_m'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_8/attention/output/dense/bias/adam_v ['generator', 'encoder', 'layer_8', 'attention', 'output', 'dense', 'bias', 'adam_v'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_8/attention/output/dense/kernel ['generator', 'encoder', 'layer_8', 'attention', 'output', 'dense', 'kernel'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_8/attention/output/dense/kernel/adam_m ['generator', 'encoder', 'layer_8', 'attention', 'output', 'dense', 'kernel', 'adam_m'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_8/attention/output/dense/kernel/adam_v ['generator', 'encoder', 'layer_8', 'attention', 'output', 'dense', 'kernel', 'adam_v'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_8/attention/self/key/bias ['generator', 'encoder', 'layer_8', 'attention', 'self', 'key', 'bias'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_8/attention/self/key/bias/adam_m ['generator', 'encoder', 'layer_8', 'attention', 'self', 'key', 'bias', 'adam_m'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_8/attention/self/key/bias/adam_v ['generator', 'encoder', 'layer_8', 'attention', 'self', 'key', 'bias', 'adam_v'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_8/attention/self/key/kernel ['generator', 'encoder', 'layer_8', 'attention', 'self', 'key', 'kernel'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_8/attention/self/key/kernel/adam_m ['generator', 'encoder', 'layer_8', 'attention', 'self', 'key', 'kernel', 'adam_m'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_8/attention/self/key/kernel/adam_v ['generator', 'encoder', 'layer_8', 'attention', 'self', 'key', 'kernel', 'adam_v'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_8/attention/self/query/bias ['generator', 'encoder', 'layer_8', 'attention', 'self', 'query', 'bias'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_8/attention/self/query/bias/adam_m ['generator', 'encoder', 'layer_8', 'attention', 'self', 'query', 'bias', 'adam_m'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_8/attention/self/query/bias/adam_v ['generator', 'encoder', 'layer_8', 'attention', 'self', 'query', 'bias', 'adam_v'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_8/attention/self/query/kernel ['generator', 'encoder', 'layer_8', 'attention', 'self', 'query', 'kernel'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_8/attention/self/query/kernel/adam_m ['generator', 'encoder', 'layer_8', 'attention', 'self', 'query', 'kernel', 'adam_m'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_8/attention/self/query/kernel/adam_v ['generator', 'encoder', 'layer_8', 'attention', 'self', 'query', 'kernel', 'adam_v'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_8/attention/self/value/bias ['generator', 'encoder', 'layer_8', 'attention', 'self', 'value', 'bias'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_8/attention/self/value/bias/adam_m ['generator', 'encoder', 'layer_8', 'attention', 'self', 'value', 'bias', 'adam_m'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_8/attention/self/value/bias/adam_v ['generator', 'encoder', 'layer_8', 'attention', 'self', 'value', 'bias', 'adam_v'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_8/attention/self/value/kernel ['generator', 'encoder', 'layer_8', 'attention', 'self', 'value', 'kernel'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_8/attention/self/value/kernel/adam_m ['generator', 'encoder', 'layer_8', 'attention', 'self', 'value', 'kernel', 'adam_m'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_8/attention/self/value/kernel/adam_v ['generator', 'encoder', 'layer_8', 'attention', 'self', 'value', 'kernel', 'adam_v'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_8/intermediate/dense/bias ['generator', 'encoder', 'layer_8', 'intermediate', 'dense', 'bias'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_8/intermediate/dense/bias/adam_m ['generator', 'encoder', 'layer_8', 'intermediate', 'dense', 'bias', 'adam_m'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_8/intermediate/dense/bias/adam_v ['generator', 'encoder', 'layer_8', 'intermediate', 'dense', 'bias', 'adam_v'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_8/intermediate/dense/kernel ['generator', 'encoder', 'layer_8', 'intermediate', 'dense', 'kernel'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_8/intermediate/dense/kernel/adam_m ['generator', 'encoder', 'layer_8', 'intermediate', 'dense', 'kernel', 'adam_m'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_8/intermediate/dense/kernel/adam_v ['generator', 'encoder', 'layer_8', 'intermediate', 'dense', 'kernel', 'adam_v'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_8/output/LayerNorm/beta ['generator', 'encoder', 'layer_8', 'output', 'LayerNorm', 'beta'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_8/output/LayerNorm/beta/adam_m ['generator', 'encoder', 'layer_8', 'output', 'LayerNorm', 'beta', 'adam_m'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_8/output/LayerNorm/beta/adam_v ['generator', 'encoder', 'layer_8', 'output', 'LayerNorm', 'beta', 'adam_v'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_8/output/LayerNorm/gamma ['generator', 'encoder', 'layer_8', 'output', 'LayerNorm', 'gamma'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_8/output/LayerNorm/gamma/adam_m ['generator', 'encoder', 'layer_8', 'output', 'LayerNorm', 'gamma', 'adam_m'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_8/output/LayerNorm/gamma/adam_v ['generator', 'encoder', 'layer_8', 'output', 'LayerNorm', 'gamma', 'adam_v'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_8/output/dense/bias ['generator', 'encoder', 'layer_8', 'output', 'dense', 'bias'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_8/output/dense/bias/adam_m ['generator', 'encoder', 'layer_8', 'output', 'dense', 'bias', 'adam_m'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_8/output/dense/bias/adam_v ['generator', 'encoder', 'layer_8', 'output', 'dense', 'bias', 'adam_v'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_8/output/dense/kernel ['generator', 'encoder', 'layer_8', 'output', 'dense', 'kernel'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_8/output/dense/kernel/adam_m ['generator', 'encoder', 'layer_8', 'output', 'dense', 'kernel', 'adam_m'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_8/output/dense/kernel/adam_v ['generator', 'encoder', 'layer_8', 'output', 'dense', 'kernel', 'adam_v'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_9/attention/output/LayerNorm/beta ['generator', 'encoder', 'layer_9', 'attention', 'output', 'LayerNorm', 'beta'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_9/attention/output/LayerNorm/beta/adam_m ['generator', 'encoder', 'layer_9', 'attention', 'output', 'LayerNorm', 'beta', 'adam_m'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_9/attention/output/LayerNorm/beta/adam_v ['generator', 'encoder', 'layer_9', 'attention', 'output', 'LayerNorm', 'beta', 'adam_v'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_9/attention/output/LayerNorm/gamma ['generator', 'encoder', 'layer_9', 'attention', 'output', 'LayerNorm', 'gamma'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_9/attention/output/LayerNorm/gamma/adam_m ['generator', 'encoder', 'layer_9', 'attention', 'output', 'LayerNorm', 'gamma', 'adam_m'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_9/attention/output/LayerNorm/gamma/adam_v ['generator', 'encoder', 'layer_9', 'attention', 'output', 'LayerNorm', 'gamma', 'adam_v'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_9/attention/output/dense/bias ['generator', 'encoder', 'layer_9', 'attention', 'output', 'dense', 'bias'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_9/attention/output/dense/bias/adam_m ['generator', 'encoder', 'layer_9', 'attention', 'output', 'dense', 'bias', 'adam_m'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_9/attention/output/dense/bias/adam_v ['generator', 'encoder', 'layer_9', 'attention', 'output', 'dense', 'bias', 'adam_v'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_9/attention/output/dense/kernel ['generator', 'encoder', 'layer_9', 'attention', 'output', 'dense', 'kernel'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_9/attention/output/dense/kernel/adam_m ['generator', 'encoder', 'layer_9', 'attention', 'output', 'dense', 'kernel', 'adam_m'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_9/attention/output/dense/kernel/adam_v ['generator', 'encoder', 'layer_9', 'attention', 'output', 'dense', 'kernel', 'adam_v'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_9/attention/self/key/bias ['generator', 'encoder', 'layer_9', 'attention', 'self', 'key', 'bias'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_9/attention/self/key/bias/adam_m ['generator', 'encoder', 'layer_9', 'attention', 'self', 'key', 'bias', 'adam_m'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_9/attention/self/key/bias/adam_v ['generator', 'encoder', 'layer_9', 'attention', 'self', 'key', 'bias', 'adam_v'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_9/attention/self/key/kernel ['generator', 'encoder', 'layer_9', 'attention', 'self', 'key', 'kernel'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_9/attention/self/key/kernel/adam_m ['generator', 'encoder', 'layer_9', 'attention', 'self', 'key', 'kernel', 'adam_m'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_9/attention/self/key/kernel/adam_v ['generator', 'encoder', 'layer_9', 'attention', 'self', 'key', 'kernel', 'adam_v'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_9/attention/self/query/bias ['generator', 'encoder', 'layer_9', 'attention', 'self', 'query', 'bias'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_9/attention/self/query/bias/adam_m ['generator', 'encoder', 'layer_9', 'attention', 'self', 'query', 'bias', 'adam_m'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_9/attention/self/query/bias/adam_v ['generator', 'encoder', 'layer_9', 'attention', 'self', 'query', 'bias', 'adam_v'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_9/attention/self/query/kernel ['generator', 'encoder', 'layer_9', 'attention', 'self', 'query', 'kernel'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_9/attention/self/query/kernel/adam_m ['generator', 'encoder', 'layer_9', 'attention', 'self', 'query', 'kernel', 'adam_m'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_9/attention/self/query/kernel/adam_v ['generator', 'encoder', 'layer_9', 'attention', 'self', 'query', 'kernel', 'adam_v'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_9/attention/self/value/bias ['generator', 'encoder', 'layer_9', 'attention', 'self', 'value', 'bias'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_9/attention/self/value/bias/adam_m ['generator', 'encoder', 'layer_9', 'attention', 'self', 'value', 'bias', 'adam_m'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_9/attention/self/value/bias/adam_v ['generator', 'encoder', 'layer_9', 'attention', 'self', 'value', 'bias', 'adam_v'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_9/attention/self/value/kernel ['generator', 'encoder', 'layer_9', 'attention', 'self', 'value', 'kernel'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_9/attention/self/value/kernel/adam_m ['generator', 'encoder', 'layer_9', 'attention', 'self', 'value', 'kernel', 'adam_m'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_9/attention/self/value/kernel/adam_v ['generator', 'encoder', 'layer_9', 'attention', 'self', 'value', 'kernel', 'adam_v'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_9/intermediate/dense/bias ['generator', 'encoder', 'layer_9', 'intermediate', 'dense', 'bias'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_9/intermediate/dense/bias/adam_m ['generator', 'encoder', 'layer_9', 'intermediate', 'dense', 'bias', 'adam_m'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_9/intermediate/dense/bias/adam_v ['generator', 'encoder', 'layer_9', 'intermediate', 'dense', 'bias', 'adam_v'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_9/intermediate/dense/kernel ['generator', 'encoder', 'layer_9', 'intermediate', 'dense', 'kernel'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_9/intermediate/dense/kernel/adam_m ['generator', 'encoder', 'layer_9', 'intermediate', 'dense', 'kernel', 'adam_m'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_9/intermediate/dense/kernel/adam_v ['generator', 'encoder', 'layer_9', 'intermediate', 'dense', 'kernel', 'adam_v'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_9/output/LayerNorm/beta ['generator', 'encoder', 'layer_9', 'output', 'LayerNorm', 'beta'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_9/output/LayerNorm/beta/adam_m ['generator', 'encoder', 'layer_9', 'output', 'LayerNorm', 'beta', 'adam_m'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_9/output/LayerNorm/beta/adam_v ['generator', 'encoder', 'layer_9', 'output', 'LayerNorm', 'beta', 'adam_v'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_9/output/LayerNorm/gamma ['generator', 'encoder', 'layer_9', 'output', 'LayerNorm', 'gamma'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_9/output/LayerNorm/gamma/adam_m ['generator', 'encoder', 'layer_9', 'output', 'LayerNorm', 'gamma', 'adam_m'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_9/output/LayerNorm/gamma/adam_v ['generator', 'encoder', 'layer_9', 'output', 'LayerNorm', 'gamma', 'adam_v'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_9/output/dense/bias ['generator', 'encoder', 'layer_9', 'output', 'dense', 'bias'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_9/output/dense/bias/adam_m ['generator', 'encoder', 'layer_9', 'output', 'dense', 'bias', 'adam_m'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_9/output/dense/bias/adam_v ['generator', 'encoder', 'layer_9', 'output', 'dense', 'bias', 'adam_v'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_9/output/dense/kernel ['generator', 'encoder', 'layer_9', 'output', 'dense', 'kernel'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_9/output/dense/kernel/adam_m ['generator', 'encoder', 'layer_9', 'output', 'dense', 'kernel', 'adam_m'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator/encoder/layer_9/output/dense/kernel/adam_v ['generator', 'encoder', 'layer_9', 'output', 'dense', 'kernel', 'adam_v'] 'ElectraForPreTraining' object has no attribute 'generator'\n",
      "Skipping generator_predictions/LayerNorm/beta ['generator_predictions', 'LayerNorm', 'beta'] 'ElectraForPreTraining' object has no attribute 'generator_predictions'\n",
      "Skipping generator_predictions/LayerNorm/beta/adam_m ['generator_predictions', 'LayerNorm', 'beta', 'adam_m'] 'ElectraForPreTraining' object has no attribute 'generator_predictions'\n",
      "Skipping generator_predictions/LayerNorm/beta/adam_v ['generator_predictions', 'LayerNorm', 'beta', 'adam_v'] 'ElectraForPreTraining' object has no attribute 'generator_predictions'\n",
      "Skipping generator_predictions/LayerNorm/gamma ['generator_predictions', 'LayerNorm', 'gamma'] 'ElectraForPreTraining' object has no attribute 'generator_predictions'\n",
      "Skipping generator_predictions/LayerNorm/gamma/adam_m ['generator_predictions', 'LayerNorm', 'gamma', 'adam_m'] 'ElectraForPreTraining' object has no attribute 'generator_predictions'\n",
      "Skipping generator_predictions/LayerNorm/gamma/adam_v ['generator_predictions', 'LayerNorm', 'gamma', 'adam_v'] 'ElectraForPreTraining' object has no attribute 'generator_predictions'\n",
      "Skipping generator_predictions/dense/bias ['generator_predictions', 'dense', 'bias'] 'ElectraForPreTraining' object has no attribute 'generator_predictions'\n",
      "Skipping generator_predictions/dense/bias/adam_m ['generator_predictions', 'dense', 'bias', 'adam_m'] 'ElectraForPreTraining' object has no attribute 'generator_predictions'\n",
      "Skipping generator_predictions/dense/bias/adam_v ['generator_predictions', 'dense', 'bias', 'adam_v'] 'ElectraForPreTraining' object has no attribute 'generator_predictions'\n",
      "Skipping generator_predictions/dense/kernel ['generator_predictions', 'dense', 'kernel'] 'ElectraForPreTraining' object has no attribute 'generator_predictions'\n",
      "Skipping generator_predictions/dense/kernel/adam_m ['generator_predictions', 'dense', 'kernel', 'adam_m'] 'ElectraForPreTraining' object has no attribute 'generator_predictions'\n",
      "Skipping generator_predictions/dense/kernel/adam_v ['generator_predictions', 'dense', 'kernel', 'adam_v'] 'ElectraForPreTraining' object has no attribute 'generator_predictions'\n",
      "Skipping generator_predictions/output_bias ['generator_lm_head', 'bias'] 'ElectraForPreTraining' object has no attribute 'generator_lm_head'\n",
      "Skipping generator_predictions/output_bias/adam_m ['generator_lm_head', 'bias', 'adam_m'] 'ElectraForPreTraining' object has no attribute 'generator_lm_head'\n",
      "Skipping generator_predictions/output_bias/adam_v ['generator_lm_head', 'bias', 'adam_v'] 'ElectraForPreTraining' object has no attribute 'generator_lm_head'\n",
      "Save PyTorch model to electra-base-discriminator-bahasa-cased/pytorch_model.bin\n"
     ]
    }
   ],
   "source": [
    "# Convert the `out/model.ckpt-60000` TensorFlow checkpoint into PyTorch weights for the\n",
    "# discriminator and write them into the exported model directory.\n",
    "convert_tf_checkpoint_to_pytorch(\n",
    "    tf_checkpoint_path='out/model.ckpt-60000',\n",
    "    config_file='BASE-config-discriminator.json',\n",
    "    pytorch_dump_path='electra-base-discriminator-bahasa-cased/pytorch_model.bin',\n",
    "    discriminator_or_generator='discriminator',\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:transformers.tokenization_utils:Model name './electra-base-discriminator-bahasa-cased' not found in model shortcut name list (google/electra-small-generator, google/electra-base-generator, google/electra-large-generator, google/electra-small-discriminator, google/electra-base-discriminator, google/electra-large-discriminator). Assuming './electra-base-discriminator-bahasa-cased' is a path, a model identifier, or url to a directory containing tokenizer files.\n",
      "INFO:transformers.tokenization_utils:Didn't find file ./electra-base-discriminator-bahasa-cased/added_tokens.json. We won't load it.\n",
      "INFO:transformers.tokenization_utils:loading file ./electra-base-discriminator-bahasa-cased/vocab.txt\n",
      "INFO:transformers.tokenization_utils:loading file None\n",
      "INFO:transformers.tokenization_utils:loading file ./electra-base-discriminator-bahasa-cased/special_tokens_map.json\n",
      "INFO:transformers.tokenization_utils:loading file ./electra-base-discriminator-bahasa-cased/tokenizer_config.json\n"
     ]
    }
   ],
   "source": [
    "# Reload the tokenizer from the exported model directory; `do_lower_case=False` keeps\n",
    "# the input casing intact.\n",
    "tokenizer = ElectraTokenizer.from_pretrained(\n",
    "    './electra-base-discriminator-bahasa-cased',\n",
    "    do_lower_case=False,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "ElectraConfig {\n",
       "  \"_num_labels\": 2,\n",
       "  \"architectures\": null,\n",
       "  \"attention_probs_dropout_prob\": 0.1,\n",
       "  \"bad_words_ids\": null,\n",
       "  \"bos_token_id\": null,\n",
       "  \"decoder_start_token_id\": null,\n",
       "  \"do_sample\": false,\n",
       "  \"early_stopping\": false,\n",
       "  \"embedding_size\": 768,\n",
       "  \"eos_token_id\": null,\n",
       "  \"finetuning_task\": null,\n",
       "  \"hidden_act\": \"gelu\",\n",
       "  \"hidden_dropout_prob\": 0.1,\n",
       "  \"hidden_size\": 768,\n",
       "  \"id2label\": {\n",
       "    \"0\": \"LABEL_0\",\n",
       "    \"1\": \"LABEL_1\"\n",
       "  },\n",
       "  \"initializer_range\": 0.02,\n",
       "  \"intermediate_size\": 3072,\n",
       "  \"is_decoder\": false,\n",
       "  \"is_encoder_decoder\": false,\n",
       "  \"label2id\": {\n",
       "    \"LABEL_0\": 0,\n",
       "    \"LABEL_1\": 1\n",
       "  },\n",
       "  \"layer_norm_eps\": 1e-12,\n",
       "  \"length_penalty\": 1.0,\n",
       "  \"max_length\": 20,\n",
       "  \"max_position_embeddings\": 512,\n",
       "  \"min_length\": 0,\n",
       "  \"model_type\": \"electra\",\n",
       "  \"no_repeat_ngram_size\": 0,\n",
       "  \"num_attention_heads\": 12,\n",
       "  \"num_beams\": 1,\n",
       "  \"num_hidden_layers\": 12,\n",
       "  \"num_return_sequences\": 1,\n",
       "  \"output_attentions\": false,\n",
       "  \"output_hidden_states\": false,\n",
       "  \"output_past\": true,\n",
       "  \"pad_token_id\": 0,\n",
       "  \"prefix\": null,\n",
       "  \"pruned_heads\": {},\n",
       "  \"repetition_penalty\": 1.0,\n",
       "  \"task_specific_params\": null,\n",
       "  \"temperature\": 1.0,\n",
       "  \"top_k\": 50,\n",
       "  \"top_p\": 1.0,\n",
       "  \"torchscript\": false,\n",
       "  \"type_vocab_size\": 2,\n",
       "  \"use_bfloat16\": false,\n",
       "  \"vocab_size\": 32000\n",
       "}"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "config = ElectraConfig('BASE-config-discriminator.json')\n",
    "config.vocab_size = 32000\n",
    "config.hidden_size = 768\n",
    "config.intermediate_size = 3072\n",
    "config.num_attention_heads = 12\n",
    "config.embedding_size = 768\n",
    "config"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:transformers.modeling_utils:loading weights file ./electra-base-discriminator-bahasa-cased/pytorch_model.bin\n"
     ]
    }
   ],
   "source": [
    "from transformers import ElectraForPreTraining, ElectraTokenizerFast\n",
    "import torch\n",
    "\n",
    "discriminator = ElectraForPreTraining.from_pretrained('./electra-base-discriminator-bahasa-cased/pytorch_model.bin',\n",
    "                                                     config = config)\n",
    "\n",
    "sentence = '1mbd menjejaskan imej negara'\n",
    "fake_sentence = '1mbd menaikkan imej negara'\n",
    "\n",
    "fake_tokens = tokenizer.tokenize(fake_sentence)\n",
    "fake_inputs = tokenizer.encode(fake_sentence, return_tensors=\"pt\")\n",
    "discriminator_outputs = discriminator(fake_inputs)\n",
    "predictions = torch.round((torch.sign(discriminator_outputs[0]) + 1) / 2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:transformers.configuration_utils:Configuration saved in electra-base-discriminator-bahasa-cased/config.json\n",
      "INFO:transformers.modeling_utils:Model weights saved in electra-base-discriminator-bahasa-cased/pytorch_model.bin\n"
     ]
    }
   ],
   "source": [
    "discriminator.save_pretrained('electra-base-discriminator-bahasa-cased')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
    "# !transformers-cli upload ./electra-base-discriminator-bahasa-cased"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "model = ElectraForPreTraining.from_pretrained('huseinzol05/electra-base-discriminator-bahasa-cased')\n",
    "tokenizer = ElectraTokenizer.from_pretrained('huseinzol05/electra-base-discriminator-bahasa-cased', \n",
    "                                             do_lower_case = False)\n",
    "\n",
    "sentence = 'kerajaan sangat prihatin terhadap rakyat'\n",
    "fake_tokens = tokenizer.tokenize(sentence)\n",
    "fake_inputs = tokenizer.encode(sentence, return_tensors=\"pt\")\n",
    "discriminator_outputs = discriminator(fake_inputs)\n",
    "predictions = torch.round((torch.sign(discriminator_outputs[0]) + 1) / 2)\n",
    "\n",
    "list(zip(fake_tokens, predictions.tolist()))"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
